From 947b86391e759d966d69d7a9f871f5daa9064a08 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Sat, 1 Nov 2025 22:19:40 +0100 Subject: [PATCH 01/26] fix(network): dedupe concurrent connection attempts --- .../src/node/network_bridge/p2p_protoc.rs | 173 ++++++++++++++---- 1 file changed, 135 insertions(+), 38 deletions(-) diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index 6f7811b6c..9df16a277 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -397,7 +397,7 @@ impl P2pConnManager { .await?; // Wait for connection to be established (with timeout) - match timeout(Duration::from_secs(5), result.recv()).await { + match timeout(Duration::from_secs(20), result.recv()).await { Ok(Some(Ok(_))) => { // Connection established, try sending again // IMPORTANT: Use single get() call to avoid TOCTOU race @@ -492,13 +492,15 @@ impl P2pConnManager { "Cleaning up in-progress connection reservations" ); - for (addr, mut callback) in state.awaiting_connection.drain() { - tracing::debug!(%addr, "Notifying awaiting connection of shutdown"); + for (addr, mut callbacks) in state.awaiting_connection.drain() { + tracing::debug!(%addr, callbacks = callbacks.len(), "Notifying awaiting connection of shutdown"); // Best effort notification - ignore errors since we're shutting down anyway // The callback sender will handle cleanup on their side - let _ = callback - .send_result(Err(HandshakeError::ChannelClosed)) - .await; + for mut callback in callbacks.drain(..) { + let _ = callback + .send_result(Err(HandshakeError::ChannelClosed)) + .await; + } } tracing::info!("Cleanup complete, exiting event loop"); @@ -1020,24 +1022,107 @@ impl P2pConnManager { } tracing::debug!(tx = %tx, "Blocked addresses: {:?}, peer addr: {}", blocked_addrs, peer.addr); } - state.awaiting_connection.insert(peer.addr, callback); + match state.awaiting_connection.entry(peer.addr) { + std::collections::hash_map::Entry::Occupied(mut callbacks) => { + tracing::debug!( + tx = %tx, + remote = %peer.addr, + pending = callbacks.get().len(), + "Connection already pending, queuing additional requester" + ); + callbacks.get_mut().push(callback); + return Ok(()); + } + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(vec![callback]); + } + } let res = timeout( Duration::from_secs(10), handshake_handler_msg.establish_conn(peer.clone(), tx, is_gw), ) - .await - .inspect_err(|error| { - tracing::error!(tx = %tx, "Failed to establish connection: {:?}", error); - })?; + .await; match res { - Ok(()) => { - tracing::debug!(tx = %tx, + Ok(Ok(())) => { + tracing::debug!( + tx = %tx, "Successfully initiated connection process for peer: {:?}", peer ); Ok(()) } - Err(e) => Err(anyhow::Error::msg(e)), + Ok(Err(e)) => { + tracing::warn!( + tx = %tx, + remote = %peer.addr, + error = ?e, + "Handshake establish_conn returned error" + ); + if let Some(callbacks) = state.awaiting_connection.remove(&peer.addr) { + let mut callbacks = callbacks.into_iter(); + if let Some(mut cb) = callbacks.next() { + cb.send_result(Err(e)) + .await + .inspect_err(|send_err| { + tracing::debug!( + remote = %peer.addr, + error = ?send_err, + "Failed to deliver handshake error to awaiting callback" + ); + }) + .ok(); + } + for mut cb in callbacks { + cb.send_result(Err(HandshakeError::ConnectionClosed(peer.addr))) + .await + .inspect_err(|send_err| { + tracing::debug!( + remote = %peer.addr, + error = ?send_err, + "Failed to deliver fallback handshake error to awaiting callback" + ); + }) + .ok(); + } + } + Ok(()) + } + Err(elapsed) => { + tracing::warn!( + tx = %tx, + remote = %peer.addr, + elapsed = ?elapsed, + "Timeout while establishing connection" + ); + if let Some(callbacks) = state.awaiting_connection.remove(&peer.addr) { + let mut iter = callbacks.into_iter(); + if let Some(mut cb) = iter.next() { + cb.send_result(Err(HandshakeError::ConnectionClosed(peer.addr))) + .await + .inspect_err(|send_err| { + tracing::debug!( + remote = %peer.addr, + error = ?send_err, + "Failed to deliver connection timeout to awaiting callback" + ); + }) + .ok(); + } + for mut cb in iter { + cb.send_result(Err(HandshakeError::ChannelClosed)) + .await + .inspect_err(|send_err| { + tracing::debug!( + remote = %peer.addr, + error = ?send_err, + "Failed to deliver fallback connection timeout to awaiting callback" + ); + }) + .ok(); + } + } + Ok(()) + } } } @@ -1172,15 +1257,23 @@ impl P2pConnManager { return Err(error.into()); } } - if let Some(mut r) = state.awaiting_connection.remove(&peer_id.addr) { - // Don't propagate channel closed errors - just log and continue - // The receiver may have timed out or been cancelled, which shouldn't crash the node - r.send_result(Err(error)) - .await - .inspect_err(|e| { - tracing::warn!(%peer_id, "Failed to send connection error notification - receiver may have timed out: {:?}", e); - }) - .ok(); + if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { + let mut callbacks = callbacks.into_iter(); + if let Some(mut r) = callbacks.next() { + // Don't propagate channel closed errors - just log and continue + // The receiver may have timed out or been cancelled, which shouldn't crash the node + r.send_result(Err(error)) + .await + .inspect_err(|e| { + tracing::warn!(%peer_id, "Failed to send connection error notification - receiver may have timed out: {:?}", e); + }) + .ok(); + } + for mut r in callbacks { + if let Err(e) = r.send_result(Err(HandshakeError::ChannelClosed)).await { + tracing::debug!(%peer_id, "Failed to send fallback connection error notification: {:?}", e); + } + } } } HandshakeEvent::RemoveTransaction(tx) => { @@ -1188,10 +1281,12 @@ impl P2pConnManager { } HandshakeEvent::OutboundGatewayConnectionRejected { peer_id } => { tracing::info!(%peer_id, "Connection rejected by peer"); - if let Some(mut r) = state.awaiting_connection.remove(&peer_id.addr) { - // Don't propagate channel closed errors - just log and continue - if let Err(e) = r.send_result(Err(HandshakeError::ChannelClosed)).await { - tracing::debug!(%peer_id, "Failed to send rejection notification: {:?}", e); + if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { + for mut r in callbacks { + // Don't propagate channel closed errors - just log and continue + if let Err(e) = r.send_result(Err(HandshakeError::ChannelClosed)).await { + tracing::debug!(%peer_id, "Failed to send rejection notification: {:?}", e); + } } } } @@ -1225,8 +1320,8 @@ impl P2pConnManager { select_stream: &mut priority_select::ProductionPrioritySelectStream, remaining_checks: Option, ) -> anyhow::Result<()> { - if let Some(mut cb) = state.awaiting_connection.remove(&peer_id.addr) { - let peer_id = if let Some(peer_id) = self + if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { + let resolved_peer_id = if let Some(peer_id) = self .bridge .op_manager .ring @@ -1241,14 +1336,16 @@ impl P2pConnManager { let key = (*self.bridge.op_manager.ring.connection_manager.pub_key).clone(); PeerId::new(self_addr, key) }; - timeout( - Duration::from_secs(60), - cb.send_result(Ok((peer_id, remaining_checks))), - ) - .await - .inspect_err(|error| { - tracing::error!("Failed to send connection result: {:?}", error); - })??; + for mut cb in callbacks { + timeout( + Duration::from_secs(60), + cb.send_result(Ok((resolved_peer_id.clone(), remaining_checks))), + ) + .await + .inspect_err(|error| { + tracing::error!("Failed to send connection result: {:?}", error); + })??; + } } else { tracing::warn!(%peer_id, "No callback for connection established"); } @@ -1527,7 +1624,7 @@ struct EventListenerState { tx_to_client: HashMap>, client_waiting_transaction: Vec<(WaitingTransaction, HashSet)>, transient_conn: HashMap, - awaiting_connection: HashMap>, + awaiting_connection: HashMap>>, pending_op_results: HashMap>, } From 24b1ab2df4a6f56319213324151680d89ceb1da6 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Sun, 2 Nov 2025 22:31:37 +0100 Subject: [PATCH 02/26] fix(network): stabilize subscription handling and gateway routing --- crates/core/src/client_events/mod.rs | 33 +- .../core/src/client_events/session_actor.rs | 344 ++++++++++-- crates/core/src/contract/executor/runtime.rs | 5 + crates/core/src/contract/handler.rs | 27 + crates/core/src/node/mod.rs | 14 +- .../core/src/node/network_bridge/handshake.rs | 1 + .../src/node/network_bridge/p2p_protoc.rs | 276 ++++++++-- crates/core/src/operations/connect.rs | 170 ++++-- crates/core/src/operations/get.rs | 128 ++++- crates/core/src/operations/put.rs | 134 +++-- crates/core/src/operations/subscribe.rs | 515 +++++++++++++++--- crates/core/src/operations/subscribe/tests.rs | 61 ++- crates/core/src/operations/update.rs | 236 +++++++- crates/core/src/ring/connection_manager.rs | 90 ++- crates/core/src/ring/mod.rs | 36 +- crates/core/src/ring/seeding.rs | 67 ++- crates/core/src/router/mod.rs | 20 +- .../core/src/transport/connection_handler.rs | 54 +- scripts/deploy-local-gateway.sh | 6 +- 19 files changed, 1832 insertions(+), 385 deletions(-) diff --git a/crates/core/src/client_events/mod.rs b/crates/core/src/client_events/mod.rs index c4db9c6a6..8fd6d42c4 100644 --- a/crates/core/src/client_events/mod.rs +++ b/crates/core/src/client_events/mod.rs @@ -1158,24 +1158,14 @@ async fn process_open_request( "Starting direct SUBSCRIBE operation (legacy mode)", ); - // Legacy mode: direct operation without deduplication - let op_id = - crate::node::subscribe(op_manager.clone(), key, Some(client_id)) - .await - .inspect_err(|err| { - tracing::error!("Subscribe error: {}", err); - })?; - - tracing::debug!( - request_id = %request_id, - transaction_id = %op_id, - operation = "subscribe", - "Request-Transaction correlation" - ); + // Legacy mode: generate transaction, register first, then run op + let tx = crate::message::Transaction::new::< + crate::operations::subscribe::SubscribeMsg, + >(); op_manager .ch_outbound - .waiting_for_transaction_result(op_id, client_id, request_id) + .waiting_for_transaction_result(tx, client_id, request_id) .await .inspect_err(|err| { tracing::error!( @@ -1183,6 +1173,19 @@ async fn process_open_request( err ); })?; + + crate::node::subscribe_with_id(op_manager.clone(), key, None, Some(tx)) + .await + .inspect_err(|err| { + tracing::error!("Subscribe error: {}", err); + })?; + + tracing::debug!( + request_id = %request_id, + transaction_id = %tx, + operation = "subscribe", + "Request-Transaction correlation" + ); } } _ => { diff --git a/crates/core/src/client_events/session_actor.rs b/crates/core/src/client_events/session_actor.rs index 2f0825b4a..6961659f3 100644 --- a/crates/core/src/client_events/session_actor.rs +++ b/crates/core/src/client_events/session_actor.rs @@ -3,9 +3,10 @@ //! This module provides a simplified session actor that manages client sessions //! and handles efficient 1→N result delivery to multiple clients. -use crate::client_events::{ClientId, HostResult, RequestId}; +use crate::client_events::{ClientId, HostResponse, HostResult, RequestId}; use crate::contract::{ClientResponsesSender, SessionMessage}; use crate::message::Transaction; +use freenet_stdlib::client_api::ContractResponse; use std::collections::{HashMap, HashSet}; use tokio::sync::mpsc; use tracing::debug; @@ -16,9 +17,25 @@ pub struct SessionActor { client_transactions: HashMap>, // Track RequestId correlation for each (Transaction, ClientId) pair client_request_ids: HashMap<(Transaction, ClientId), RequestId>, + pending_results: HashMap, client_responses: ClientResponsesSender, } +#[derive(Clone)] +struct PendingResult { + result: std::sync::Arc, + delivered_clients: HashSet, +} + +impl PendingResult { + fn new(result: std::sync::Arc) -> Self { + Self { + result, + delivered_clients: HashSet::new(), + } + } +} + impl SessionActor { /// Create a new session actor pub fn new( @@ -29,6 +46,7 @@ impl SessionActor { message_rx, client_transactions: HashMap::new(), client_request_ids: HashMap::new(), + pending_results: HashMap::new(), client_responses, } } @@ -74,6 +92,18 @@ impl SessionActor { request_id, self.client_transactions.get(&tx).map_or(0, |s| s.len()) ); + + if let Some(result_arc) = self.pending_results.get_mut(&tx).and_then(|pending| { + if pending.delivered_clients.insert(client_id) { + Some(pending.result.clone()) + } else { + None + } + }) { + let mut recipients = HashSet::new(); + recipients.insert(client_id); + self.deliver_result_to_clients(tx, recipients, result_arc); + } } SessionMessage::ClientDisconnect { client_id } => { self.cleanup_client_transactions(client_id); @@ -96,62 +126,115 @@ impl SessionActor { } } - /// CORE: 1→N Result Delivery with RequestId correlation - /// Optimized with Arc to minimize cloning overhead in 1→N delivery - async fn handle_result_delivery( + fn deliver_result_to_clients( &mut self, tx: Transaction, - result: std::sync::Arc, + waiting_clients: HashSet, + result: std::sync::Arc, ) { - tracing::info!("Session actor attempting to deliver result for transaction {}, registered transactions: {}", tx, self.client_transactions.len()); - if let Some(waiting_clients) = self.client_transactions.remove(&tx) { - let client_count = waiting_clients.len(); + let client_count = waiting_clients.len(); + tracing::info!( + "Delivering result for transaction {} to {} clients", + tx, + client_count + ); + + if let Ok(HostResponse::ContractResponse(ContractResponse::GetResponse { + key, + state, + .. + })) = result.as_ref() + { tracing::info!( - "Delivering result for transaction {} to {} clients", - tx, - client_count + "Contract GET response ready for delivery: contract={} bytes={}", + key, + state.as_ref().len() ); + } - // Optimized 1→N delivery with RequestId correlation - for client_id in waiting_clients { - // Look up the RequestId for this (transaction, client) pair - let request_id = - self.client_request_ids - .remove(&(tx, client_id)) - .unwrap_or_else(|| { - tracing::warn!( - "No RequestId found for transaction {} and client {}, using default", - tx, client_id - ); - RequestId::new() - }); - - if let Err(e) = - self.client_responses - .send((client_id, request_id, (*result).clone())) - { + // Optimized 1→N delivery with RequestId correlation + for client_id in waiting_clients { + // Look up the RequestId for this (transaction, client) pair + let request_id = self + .client_request_ids + .remove(&(tx, client_id)) + .unwrap_or_else(|| { tracing::warn!( - "Failed to deliver result to client {} (request {}): {}", - client_id, - request_id, - e - ); - } else { - tracing::debug!( - "Delivered result for transaction {} to client {} with request correlation {}", - tx, client_id, request_id + "No RequestId found for transaction {} and client {}, using default", + tx, + client_id ); - } - } + RequestId::new() + }); - if client_count > 1 { + if let Err(e) = self + .client_responses + .send((client_id, request_id, (*result).clone())) + { + tracing::warn!( + "Failed to deliver result to client {} (request {}): {}", + client_id, + request_id, + e + ); + } else { tracing::debug!( - "Successfully delivered result for transaction {} to {} clients via optimized 1→N fanout with RequestId correlation", - tx, client_count + "Delivered result for transaction {} to client {} with request correlation {}", + tx, + client_id, + request_id ); } + } + + if client_count > 1 { + tracing::debug!( + "Successfully delivered result for transaction {} to {} clients via optimized 1→N fanout with RequestId correlation", + tx, + client_count + ); + } + } + + /// CORE: 1→N Result Delivery with RequestId correlation + /// Optimized with Arc to minimize cloning overhead in 1→N delivery + async fn handle_result_delivery( + &mut self, + tx: Transaction, + result: std::sync::Arc, + ) { + tracing::info!( + "Session actor attempting to deliver result for transaction {}, registered transactions: {}", + tx, + self.client_transactions.len() + ); + + let mut recipients = HashSet::new(); + let result_to_deliver = { + let entry = self + .pending_results + .entry(tx) + .or_insert_with(|| PendingResult::new(result.clone())); + entry.result = result.clone(); + + if let Some(waiting_clients) = self.client_transactions.remove(&tx) { + for client_id in waiting_clients { + if entry.delivered_clients.insert(client_id) { + recipients.insert(client_id); + } + } + } + + entry.result.clone() + }; + + if !recipients.is_empty() { + self.deliver_result_to_clients(tx, recipients, result_to_deliver); } else { - tracing::debug!("No clients waiting for transaction result: {}", tx); + tracing::debug!( + "No clients waiting for transaction result: {}, caching response for deferred delivery", + tx + ); } } @@ -199,6 +282,13 @@ impl SessionActor { e ); } else { + let entry = self + .pending_results + .entry(tx) + .or_insert_with(|| PendingResult::new(result.clone())); + entry.delivered_clients.insert(client_id); + entry.result = result.clone(); + tracing::debug!( "Delivered result for transaction {} to specific client {} with request correlation {}", tx, client_id, request_id @@ -358,6 +448,172 @@ mod tests { actor_handle.await.unwrap(); } + #[tokio::test] + async fn test_pending_result_reaches_late_registered_clients() { + use crate::contract::client_responses_channel; + use crate::operations::subscribe::SubscribeMsg; + use freenet_stdlib::client_api::{ContractResponse, HostResponse}; + use freenet_stdlib::prelude::{ContractInstanceId, ContractKey}; + + let (session_tx, session_rx) = mpsc::channel(100); + let (mut client_responses_rx, client_responses_tx) = client_responses_channel(); + let actor = SessionActor::new(session_rx, client_responses_tx); + + let actor_handle = tokio::spawn(async move { + actor.run().await; + }); + + let tx = Transaction::new::(); + let contract_key = ContractKey::from(ContractInstanceId::new([7u8; 32])); + let host_result = Ok(HostResponse::ContractResponse( + ContractResponse::SubscribeResponse { + key: contract_key, + subscribed: true, + }, + )); + + // Deliver result before any clients register; this models LocalSubscribeComplete firing + // before the session actor processes the pending subscription registration. + session_tx + .send(SessionMessage::DeliverHostResponse { + tx, + response: std::sync::Arc::new(host_result.clone()), + }) + .await + .unwrap(); + + // First client registers and should receive the cached result. + let client_one = ClientId::FIRST; + let request_one = RequestId::new(); + session_tx + .send(SessionMessage::RegisterTransaction { + tx, + client_id: client_one, + request_id: request_one, + }) + .await + .unwrap(); + + let (delivered_client_one, delivered_request_one, delivered_result_one) = + tokio::time::timeout( + tokio::time::Duration::from_millis(200), + client_responses_rx.recv(), + ) + .await + .expect("session actor failed to deliver cached result to first client") + .expect("client response channel closed unexpectedly"); + assert_eq!(delivered_client_one, client_one); + assert_eq!(delivered_request_one, request_one); + match delivered_result_one { + Ok(HostResponse::ContractResponse(ContractResponse::SubscribeResponse { + key, + subscribed, + })) => { + assert_eq!(key, contract_key); + assert!(subscribed); + } + other => panic!("unexpected result delivered to first client: {:?}", other), + } + + // Second client registers later; we expect the cached result to still be available. + let client_two = ClientId::next(); + let request_two = RequestId::new(); + session_tx + .send(SessionMessage::RegisterTransaction { + tx, + client_id: client_two, + request_id: request_two, + }) + .await + .unwrap(); + + let (delivered_client_two, delivered_request_two, delivered_result_two) = + tokio::time::timeout( + tokio::time::Duration::from_millis(200), + client_responses_rx.recv(), + ) + .await + .expect("pending result was not delivered to late-registered client") + .expect("client response channel closed unexpectedly for late registrant"); + assert_eq!(delivered_client_two, client_two); + assert_eq!(delivered_request_two, request_two); + match delivered_result_two { + Ok(HostResponse::ContractResponse(ContractResponse::SubscribeResponse { + key, + subscribed, + })) => { + assert_eq!(key, contract_key); + assert!(subscribed); + } + other => panic!( + "unexpected result delivered to late-registered client: {:?}", + other + ), + } + + actor_handle.abort(); + } + + #[tokio::test] + async fn test_pending_result_delivered_after_registration() { + use crate::contract::client_responses_channel; + + let (session_tx, session_rx) = mpsc::channel(100); + let (mut client_responses_rx, client_responses_tx) = client_responses_channel(); + let actor = SessionActor::new(session_rx, client_responses_tx); + + let actor_handle = tokio::spawn(async move { + actor.run().await; + }); + + let tx = Transaction::new::(); + let client_id = ClientId::FIRST; + let request_id = RequestId::new(); + let host_result = Arc::new(Ok(HostResponse::Ok)); + + session_tx + .send(SessionMessage::DeliverHostResponse { + tx, + response: host_result.clone(), + }) + .await + .unwrap(); + + // Ensure the actor processes the pending result before registration. + tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; + + session_tx + .send(SessionMessage::RegisterTransaction { + tx, + client_id, + request_id, + }) + .await + .unwrap(); + + let delivered = tokio::time::timeout( + tokio::time::Duration::from_millis(200), + client_responses_rx.recv(), + ) + .await + .expect("Timed out waiting for pending result delivery") + .expect("Client response channel closed unexpectedly"); + + let (returned_client, returned_request, returned_result) = delivered; + assert_eq!(returned_client, client_id); + assert_eq!(returned_request, request_id); + match returned_result { + Ok(HostResponse::Ok) => {} + other => panic!( + "Unexpected result delivered. got={:?}, expected=Ok(HostResponse::Ok)", + other + ), + } + + drop(session_tx); + actor_handle.await.unwrap(); + } + #[tokio::test] async fn test_session_actor_client_disconnect_cleanup() { use crate::contract::client_responses_channel; diff --git a/crates/core/src/contract/executor/runtime.rs b/crates/core/src/contract/executor/runtime.rs index e3102a8fa..c67d46247 100644 --- a/crates/core/src/contract/executor/runtime.rs +++ b/crates/core/src/contract/executor/runtime.rs @@ -879,6 +879,11 @@ impl Executor { .await .map_err(ExecutorError::other)?; + tracing::info!( + "Contract state updated for {key}, new_size_bytes={}", + new_state.as_ref().len() + ); + if let Err(err) = self .send_update_notification(key, parameters, &new_state) .await diff --git a/crates/core/src/contract/handler.rs b/crates/core/src/contract/handler.rs index 4acdba4a6..699f59c73 100644 --- a/crates/core/src/contract/handler.rs +++ b/crates/core/src/contract/handler.rs @@ -343,6 +343,33 @@ impl ContractHandlerChannel { Ok(()) } + + pub async fn waiting_for_subscription_result( + &self, + tx: Transaction, + contract_key: ContractInstanceId, + client_id: ClientId, + request_id: RequestId, + ) -> Result<(), ContractError> { + self.end + .wait_for_res_tx + .send((client_id, WaitingTransaction::Subscription { contract_key })) + .await + .map_err(|_| ContractError::NoEvHandlerResponse)?; + + if let Some(session_tx) = &self.session_adapter_tx { + let msg = SessionMessage::RegisterTransaction { + tx, + client_id, + request_id, + }; + if let Err(e) = session_tx.try_send(msg) { + tracing::warn!("Failed to notify session actor: {}", e); + } + } + + Ok(()) + } } impl ContractHandlerChannel { diff --git a/crates/core/src/node/mod.rs b/crates/core/src/node/mod.rs index f3a4b165a..c50ac8be1 100644 --- a/crates/core/src/node/mod.rs +++ b/crates/core/src/node/mod.rs @@ -37,10 +37,7 @@ use self::p2p_impl::NodeP2P; use crate::{ client_events::{BoxedClient, ClientEventsProxy, ClientId, OpenRequest}, config::{Address, GatewayConfig, WebsocketApiConfig}, - contract::{ - Callback, ExecutorError, ExecutorToEventLoopChannel, NetworkContractHandler, - WaitingTransaction, - }, + contract::{Callback, ExecutorError, ExecutorToEventLoopChannel, NetworkContractHandler}, local_node::Executor, message::{InnerMessage, NetMessage, Transaction, TransactionType}, operations::{ @@ -1105,6 +1102,7 @@ async fn handle_pure_network_result( } /// Attempts to subscribe to a contract +#[allow(dead_code)] pub async fn subscribe( op_manager: Arc, key: ContractKey, @@ -1131,13 +1129,7 @@ pub async fn subscribe_with_id( let request_id = RequestId::new(); let _ = op_manager .ch_outbound - .waiting_for_transaction_result( - WaitingTransaction::Subscription { - contract_key: *key.id(), - }, - client_id, - request_id, - ) + .waiting_for_subscription_result(id, *key.id(), client_id, request_id) .await; } // Initialize a subscribe op. diff --git a/crates/core/src/node/network_bridge/handshake.rs b/crates/core/src/node/network_bridge/handshake.rs index 8b58402bc..051acebdb 100644 --- a/crates/core/src/node/network_bridge/handshake.rs +++ b/crates/core/src/node/network_bridge/handshake.rs @@ -669,6 +669,7 @@ fn handle_outbound_result( } Err((peer_id, error)) => { tracing::debug!(from=%peer_id.addr, "Outbound connection failed: {error}"); + tracing::info!(from=%peer_id.addr, error = ?error, "Outbound connection failed"); handler.connecting.remove(&peer_id.addr); handler.outbound_messages.remove(&peer_id.addr); handler.connection_manager.prune_alive_connection(&peer_id); diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index 9df16a277..b0db33ff4 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -384,6 +384,15 @@ impl P2pConnManager { // Queue the message for sending after connection is established let tx = *msg.id(); let (callback, mut result) = tokio::sync::mpsc::channel(10); + let target_peer_id = target_peer.peer.clone(); + let msg_clone = msg.clone(); + let bridge_sender = ctx.bridge.ev_listener_tx.clone(); + let self_peer_id = ctx + .bridge + .op_manager + .ring + .connection_manager + .get_peer_key(); // Initiate connection to the peer ctx.bridge @@ -396,49 +405,61 @@ impl P2pConnManager { })) .await?; - // Wait for connection to be established (with timeout) - match timeout(Duration::from_secs(20), result.recv()).await { - Ok(Some(Ok(_))) => { - // Connection established, try sending again - // IMPORTANT: Use single get() call to avoid TOCTOU race - let peer_connection_retry = - ctx.connections.get(&target_peer.peer); - tracing::debug!( - tx = %msg.id(), - self_peer = %ctx.bridge.op_manager.ring.connection_manager.pub_key, - target = %target_peer.peer, - conn_map_size = ctx.connections.len(), - has_connection = peer_connection_retry.is_some(), - "[CONN_TRACK] LOOKUP: Retry after connection established - checking for connection in HashMap" - ); - if let Some(peer_connection) = peer_connection_retry { - if let Err(e) = - peer_connection.send(Left(msg)).await + tracing::info!( + tx = %tx, + target = %target_peer_id, + "connect_peer: dispatched connect request, waiting asynchronously" + ); + + tokio::spawn(async move { + match timeout(Duration::from_secs(20), result.recv()).await + { + Ok(Some(Ok(_))) => { + tracing::info!( + tx = %tx, + target = %target_peer_id, + self_peer = ?self_peer_id, + "connect_peer: connection established, rescheduling message send" + ); + if let Err(e) = bridge_sender + .send(Left(( + target_peer_id.clone(), + Box::new(msg_clone), + ))) + .await { - tracing::error!("Failed to send message to peer after establishing connection: {}", e); + tracing::error!( + tx = %tx, + target = %target_peer_id, + "connect_peer: failed to reschedule message after connection: {:?}", + e + ); } - } else { + } + Ok(Some(Err(e))) => { tracing::error!( tx = %tx, - target = %target_peer.peer, - "Connection established successfully but not found in HashMap - possible race condition" + target = %target_peer_id, + "connect_peer: connection attempt returned error: {:?}", + e + ); + } + Ok(None) => { + tracing::error!( + tx = %tx, + target = %target_peer_id, + "connect_peer: response channel closed before connection result" + ); + } + Err(_) => { + tracing::error!( + tx = %tx, + target = %target_peer_id, + "connect_peer: timeout waiting for connection result" ); } } - Ok(Some(Err(e))) => { - tracing::error!( - "Failed to establish connection to {}: {:?}", - target_peer.peer, - e - ); - } - Ok(None) | Err(_) => { - tracing::error!( - "Timeout or error establishing connection to {}", - target_peer.peer - ); - } - } + }); } } } @@ -810,7 +831,35 @@ impl P2pConnManager { match op_manager.result_router_tx.send((tx, response)).await { Ok(()) => { tracing::debug!(%tx, "sent subscribe response to client"); - state.tx_to_client.remove(&tx); + if let Some(clients) = state.tx_to_client.remove(&tx) { + tracing::debug!( + "LocalSubscribeComplete removed {} waiting clients for transaction {}", + clients.len(), + tx + ); + } else if let Some(pos) = state + .client_waiting_transaction + .iter() + .position(|(waiting, _)| match waiting { + WaitingTransaction::Subscription { contract_key } => { + contract_key == key.id() + } + _ => false, + }) + { + let (_, clients) = state.client_waiting_transaction.remove(pos); + tracing::debug!( + "LocalSubscribeComplete for {} matched {} subscription waiters via contract {}", + tx, + clients.len(), + key + ); + } else { + tracing::warn!( + "LocalSubscribeComplete for {} found no waiting clients", + tx + ); + } } Err(e) => { tracing::error!(%tx, error = %e, "failed to send subscribe response") @@ -1022,21 +1071,52 @@ impl P2pConnManager { } tracing::debug!(tx = %tx, "Blocked addresses: {:?}, peer addr: {}", blocked_addrs, peer.addr); } - match state.awaiting_connection.entry(peer.addr) { + let peer_addr = peer.addr; + match state.awaiting_connection.entry(peer_addr) { std::collections::hash_map::Entry::Occupied(mut callbacks) => { + let txs_entry = state.awaiting_connection_txs.entry(peer_addr).or_default(); + if !txs_entry.contains(&tx) { + txs_entry.push(tx); + } tracing::debug!( tx = %tx, - remote = %peer.addr, + remote = %peer_addr, pending = callbacks.get().len(), "Connection already pending, queuing additional requester" ); callbacks.get_mut().push(callback); + tracing::info!( + tx = %tx, + remote = %peer_addr, + pending = callbacks.get().len(), + pending_txs = ?txs_entry, + "connect_peer: connection already pending, queued callback" + ); return Ok(()); } std::collections::hash_map::Entry::Vacant(entry) => { + let txs_entry = state.awaiting_connection_txs.entry(peer_addr).or_default(); + txs_entry.push(tx); + tracing::debug!( + tx = %tx, + remote = %peer_addr, + "connect_peer: registering new pending connection" + ); entry.insert(vec![callback]); + tracing::info!( + tx = %tx, + remote = %peer_addr, + pending = 1, + pending_txs = ?txs_entry, + "connect_peer: registered new pending connection" + ); } } + tracing::debug!( + tx = %tx, + remote = %peer.addr, + "connect_peer: dispatching establish_conn" + ); let res = timeout( Duration::from_secs(10), handshake_handler_msg.establish_conn(peer.clone(), tx, is_gw), @@ -1049,6 +1129,26 @@ impl P2pConnManager { "Successfully initiated connection process for peer: {:?}", peer ); + if let Some(callbacks) = state.awaiting_connection.get(&peer.addr) { + tracing::debug!( + tx = %tx, + remote = %peer_addr, + pending_callbacks = callbacks.len(), + "connect_peer: handshake in flight" + ); + let pending_txs = state + .awaiting_connection_txs + .get(&peer_addr) + .cloned() + .unwrap_or_default(); + tracing::info!( + tx = %tx, + remote = %peer_addr, + pending_callbacks = callbacks.len(), + pending_txs = ?pending_txs, + "connect_peer: handshake initiated - awaiting completion" + ); + } Ok(()) } Ok(Err(e)) => { @@ -1058,7 +1158,24 @@ impl P2pConnManager { error = ?e, "Handshake establish_conn returned error" ); + let pending_txs = state + .awaiting_connection_txs + .remove(&peer.addr) + .unwrap_or_default(); if let Some(callbacks) = state.awaiting_connection.remove(&peer.addr) { + tracing::debug!( + tx = %tx, + remote = %peer.addr, + callbacks = callbacks.len(), + "Handshake establish_conn returned error - notifying callbacks" + ); + tracing::info!( + tx = %tx, + remote = %peer.addr, + callbacks = callbacks.len(), + pending_txs = ?pending_txs, + "connect_peer: handshake errored - notifying callbacks" + ); let mut callbacks = callbacks.into_iter(); if let Some(mut cb) = callbacks.next() { cb.send_result(Err(e)) @@ -1094,7 +1211,24 @@ impl P2pConnManager { elapsed = ?elapsed, "Timeout while establishing connection" ); + let pending_txs = state + .awaiting_connection_txs + .remove(&peer.addr) + .unwrap_or_default(); if let Some(callbacks) = state.awaiting_connection.remove(&peer.addr) { + tracing::debug!( + tx = %tx, + remote = %peer.addr, + callbacks = callbacks.len(), + "Handshake timed out - notifying callbacks" + ); + tracing::info!( + tx = %tx, + remote = %peer.addr, + callbacks = callbacks.len(), + pending_txs = ?pending_txs, + "connect_peer: handshake timed out - notifying callbacks" + ); let mut iter = callbacks.into_iter(); if let Some(mut cb) = iter.next() { cb.send_result(Err(HandshakeError::ConnectionClosed(peer.addr))) @@ -1257,7 +1391,22 @@ impl P2pConnManager { return Err(error.into()); } } + let pending_txs = state + .awaiting_connection_txs + .remove(&peer_id.addr) + .unwrap_or_default(); if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { + tracing::debug!( + %peer_id, + callbacks = callbacks.len(), + "HandshakeEvent::OutboundConnectionFailed - notifying callbacks" + ); + tracing::info!( + %peer_id, + callbacks = callbacks.len(), + pending_txs = ?pending_txs, + "HandshakeEvent::OutboundConnectionFailed - notifying callbacks" + ); let mut callbacks = callbacks.into_iter(); if let Some(mut r) = callbacks.next() { // Don't propagate channel closed errors - just log and continue @@ -1281,7 +1430,22 @@ impl P2pConnManager { } HandshakeEvent::OutboundGatewayConnectionRejected { peer_id } => { tracing::info!(%peer_id, "Connection rejected by peer"); + let pending_txs = state + .awaiting_connection_txs + .remove(&peer_id.addr) + .unwrap_or_default(); if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { + tracing::debug!( + %peer_id, + callbacks = callbacks.len(), + "HandshakeEvent::OutboundGatewayConnectionRejected - notifying callbacks" + ); + tracing::info!( + %peer_id, + callbacks = callbacks.len(), + pending_txs = ?pending_txs, + "HandshakeEvent::OutboundGatewayConnectionRejected - notifying callbacks" + ); for mut r in callbacks { // Don't propagate channel closed errors - just log and continue if let Err(e) = r.send_result(Err(HandshakeError::ChannelClosed)).await { @@ -1320,6 +1484,10 @@ impl P2pConnManager { select_stream: &mut priority_select::ProductionPrioritySelectStream, remaining_checks: Option, ) -> anyhow::Result<()> { + let pending_txs = state + .awaiting_connection_txs + .remove(&peer_id.addr) + .unwrap_or_default(); if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { let resolved_peer_id = if let Some(peer_id) = self .bridge @@ -1336,6 +1504,18 @@ impl P2pConnManager { let key = (*self.bridge.op_manager.ring.connection_manager.pub_key).clone(); PeerId::new(self_addr, key) }; + tracing::debug!( + remote = %peer_id.addr, + callbacks = callbacks.len(), + "handle_successful_connection: notifying waiting callbacks" + ); + tracing::info!( + remote = %peer_id.addr, + callbacks = callbacks.len(), + pending_txs = ?pending_txs, + remaining_checks = ?remaining_checks, + "handle_successful_connection: connection established" + ); for mut cb in callbacks { timeout( Duration::from_secs(60), @@ -1347,7 +1527,11 @@ impl P2pConnManager { })??; } } else { - tracing::warn!(%peer_id, "No callback for connection established"); + tracing::warn!( + %peer_id, + pending_txs = ?pending_txs, + "No callback for connection established" + ); } // Only insert if connection doesn't already exist to avoid dropping existing channel @@ -1538,7 +1722,15 @@ impl P2pConnManager { match transaction { WaitingTransaction::Transaction(tx) => { tracing::debug!(%tx, %client_id, "Subscribing client to transaction results"); - state.tx_to_client.entry(tx).or_default().insert(client_id); + let entry = state.tx_to_client.entry(tx).or_default(); + let inserted = entry.insert(client_id); + tracing::debug!( + "tx_to_client: tx={} client={} inserted={} total_waiting_clients={}", + tx, + client_id, + inserted, + entry.len() + ); } WaitingTransaction::Subscription { contract_key } => { tracing::debug!(%client_id, %contract_key, "Client waiting for subscription"); @@ -1625,6 +1817,7 @@ struct EventListenerState { client_waiting_transaction: Vec<(WaitingTransaction, HashSet)>, transient_conn: HashMap, awaiting_connection: HashMap>>, + awaiting_connection_txs: HashMap>, pending_op_results: HashMap>, } @@ -1637,6 +1830,7 @@ impl EventListenerState { transient_conn: HashMap::new(), awaiting_connection: HashMap::new(), pending_op_results: HashMap::new(), + awaiting_connection_txs: HashMap::new(), } } } diff --git a/crates/core/src/operations/connect.rs b/crates/core/src/operations/connect.rs index 9b72194d9..589c27afc 100644 --- a/crates/core/src/operations/connect.rs +++ b/crates/core/src/operations/connect.rs @@ -170,6 +170,11 @@ impl Operation for ConnectOp { id, .. } => { + let ring_max_htl = op_manager.ring.max_hops_to_live.max(1); + let mut max_hops = (*max_hops_to_live).min(ring_max_htl); + if max_hops == 0 { + max_hops = 1; + } let own_loc = op_manager.ring.connection_manager.own_location(); let PeerKeyLocation { peer: this_peer, @@ -213,8 +218,8 @@ impl Operation for ConnectOp { &own_loc, joiner, &desirable_peer, - *max_hops_to_live, - *max_hops_to_live, + max_hops, + max_hops, skip_connections, skip_forwards, ); @@ -253,7 +258,7 @@ impl Operation for ConnectOp { ); debug_assert_eq!(this_peer, &joiner.peer); new_state = Some(ConnectState::AwaitingNewConnection(NewConnectionInfo { - remaining_connections: *max_hops_to_live, + remaining_connections: max_hops, })); let msg = ConnectMsg::Request { id: *id, @@ -262,7 +267,7 @@ impl Operation for ConnectOp { query_target: query_target.clone(), ideal_location: *ideal_location, joiner: joiner.clone(), - max_hops_to_live: *max_hops_to_live, + max_hops_to_live: max_hops, skip_connections, skip_forwards, }, @@ -286,6 +291,21 @@ impl Operation for ConnectOp { .. } => { let this_peer = op_manager.ring.connection_manager.own_location(); + let ring_max_htl = op_manager.ring.max_hops_to_live.max(1); + let mut max_htl = (*max_hops_to_live).min(ring_max_htl); + if max_htl == 0 { + max_htl = 1; + } + let mut hops_left = (*hops_to_live).min(max_htl); + if hops_left == 0 { + tracing::warn!( + tx = %id, + sender = %sender.peer, + joiner = %joiner.peer, + "Received CheckConnectivity with zero hops to live; clamping to 1" + ); + hops_left = 1; + } if sender.peer == joiner.peer { tracing::error!( tx = %id, @@ -313,7 +333,7 @@ impl Operation for ConnectOp { tracing::debug!( tx = %id, at = %this_peer.peer, - hops_to_live = %hops_to_live, + hops_to_live = %hops_left, joiner = %joiner, "Checking connectivity request received" ); @@ -334,29 +354,37 @@ impl Operation for ConnectOp { is_gw: false, }) .await?; - if result - .recv() - .await - .ok_or(OpError::NotificationError)? - .is_ok() - { - let was_reserved = { - // reserved just above in call to should_accept + match result.recv().await.ok_or(OpError::NotificationError)? { + Ok((peer_id, remaining_checks)) => { + tracing::info!( + tx = %id, + joiner = %joiner.peer, + connected_peer = %peer_id, + remaining_checks, + "ConnectPeer completed successfully" + ); + let was_reserved = { + // reserved just above in call to should_accept + true + }; + op_manager + .ring + .add_connection(joiner_loc, joiner.peer.clone(), was_reserved) + .await; true - }; - // Add the connection to the ring - op_manager - .ring - .add_connection(joiner_loc, joiner.peer.clone(), was_reserved) - .await; - true - } else { - // If the connection was not completed, prune the reserved connection - op_manager - .ring - .connection_manager - .prune_in_transit_connection(&joiner.peer); - false + } + Err(()) => { + tracing::info!( + tx = %id, + joiner = %joiner.peer, + "ConnectPeer failed to establish connection" + ); + op_manager + .ring + .connection_manager + .prune_in_transit_connection(&joiner.peer); + false + } } } else { tracing::debug!(tx = %id, at = %this_peer.peer, from = %joiner, "Rejecting connection"); @@ -372,8 +400,8 @@ impl Operation for ConnectOp { op_manager.ring.router.clone(), network_bridge, ForwardParams { - left_htl: *hops_to_live, - max_htl: *max_hops_to_live, + left_htl: hops_left, + max_htl, accepted: should_accept, skip_connections: skip_connections.clone(), skip_forwards: skip_forwards.clone(), @@ -441,6 +469,13 @@ impl Operation for ConnectOp { connected_to = %acceptor.peer, "Open connection acknowledged at requesting joiner peer", ); + tracing::info!( + tx = %id, + joiner = %this_peer_id, + acceptor = %acceptor.peer, + location = ?acceptor.location, + "Connect response accepted; registering connection" + ); info.accepted_by.insert(acceptor.clone()); op_manager .ring @@ -458,6 +493,12 @@ impl Operation for ConnectOp { rejected_peer = %acceptor.peer, "Connection rejected", ); + tracing::info!( + tx = %id, + joiner = %this_peer_id, + rejector = %acceptor.peer, + "Connect response rejected by peer" + ); } let your_location: Location = @@ -472,6 +513,12 @@ impl Operation for ConnectOp { .ring .connection_manager .update_location(target.location); + tracing::info!( + tx = %id, + at = %this_peer_id, + new_location = ?target.location, + "Updated joiner location from connect response" + ); if remaining_connections == 0 { tracing::debug!( @@ -1073,7 +1120,7 @@ where let num_reserved = connection_manager.get_reserved_connections(); let max_connections = connection_manager.max_connections; - tracing::debug!( + tracing::info!( tx = %id, joiner = %joiner.peer, num_connections = %num_connections, @@ -1098,10 +1145,9 @@ where // // See PR #1871 discussion with @iduartgomez for context. // - // IMPORTANT (issue #1908): Extended to cover early network formation (first few peers) - // During early network formation, the gateway should accept connections directly to ensure - // bidirectional connections are established. Without this, peers 2+ only get unidirectional - // connections (peer → gateway) but not the reverse (gateway → peer). + // IMPORTANT (issue #1908): Extended to cover early network formation (only the very first peer). + // During bootstrap we keep the first connection direct to guarantee bidirectional connectivity; + // subsequent peers should be forwarded through existing nodes. // // However, we still respect max_connections - this only applies when there's capacity. const EARLY_NETWORK_THRESHOLD: usize = 4; @@ -1160,6 +1206,13 @@ where match target_peer { Some(target_peer) => { + tracing::info!( + tx = %id, + joiner = %joiner.peer, + next_hop = %target_peer.peer, + htl = left_htl, + "forward_conn: forwarding connection request to peer candidate" + ); // Successfully found a peer to forward to let forward_msg = create_forward_message( id, @@ -1182,11 +1235,13 @@ where } None => { // Couldn't find suitable peer to forward to - tracing::debug!( + tracing::info!( tx = %id, joiner = %joiner.peer, - "No suitable peer found for forwarding despite having {} connections", - num_connections + skip_count = skip_forwards.len(), + connections = num_connections, + accepted_flag = %accepted, + "forward_conn: no suitable peer found for forwarding despite available connections" ); return Ok(None); } @@ -1209,6 +1264,9 @@ fn select_forward_target( // Create an extended skip list that includes the joiner to prevent forwarding to the joiner let mut extended_skip = skip_forwards.clone(); extended_skip.insert(joiner.peer.clone()); + if let Some(self_peer) = connection_manager.get_peer_key() { + extended_skip.insert(self_peer); + } if left_htl >= connection_manager.rnd_if_htl_above { tracing::debug!( @@ -1216,21 +1274,53 @@ fn select_forward_target( joiner = %joiner.peer, "Randomly selecting peer to forward connect request", ); - connection_manager.random_peer(|p| !extended_skip.contains(p)) + let candidate = connection_manager.random_peer(|p| !extended_skip.contains(p)); + if candidate.is_none() { + tracing::info!( + tx = %id, + joiner = %joiner.peer, + skip = ?extended_skip, + "select_forward_target: random selection found no candidate" + ); + } else if let Some(ref c) = candidate { + tracing::info!( + tx = %id, + joiner = %joiner.peer, + next_hop = %c.peer, + "select_forward_target: random candidate selected" + ); + } + candidate } else { tracing::debug!( tx = %id, joiner = %joiner.peer, "Selecting close peer to forward request", ); - connection_manager + let candidate = connection_manager .routing( joiner.location.unwrap(), Some(&request_peer.peer), &extended_skip, router, ) - .and_then(|pkl| (pkl.peer != joiner.peer).then_some(pkl)) + .and_then(|pkl| (pkl.peer != joiner.peer).then_some(pkl)); + if candidate.is_none() { + tracing::info!( + tx = %id, + joiner = %joiner.peer, + skip = ?extended_skip, + "select_forward_target: router returned no candidate" + ); + } else if let Some(ref c) = candidate { + tracing::info!( + tx = %id, + joiner = %joiner.peer, + next_hop = %c.peer, + "select_forward_target: routing candidate selected" + ); + } + candidate } } diff --git a/crates/core/src/operations/get.rs b/crates/core/src/operations/get.rs index 3d1ba21d3..7f702d139 100644 --- a/crates/core/src/operations/get.rs +++ b/crates/core/src/operations/get.rs @@ -427,11 +427,20 @@ impl Operation for GetOp { GetMsg::RequestGet { key, id, - sender: _, + sender, target, fetch_contract, skip_list, } => { + tracing::info!( + tx = %id, + %key, + target = %target.peer, + sender = %sender.peer, + fetch_contract = *fetch_contract, + skip = ?skip_list, + "GET: received RequestGet" + ); // Check if operation is already completed if matches!(self.state, Some(GetState::Finished { .. })) { tracing::debug!( @@ -449,7 +458,13 @@ impl Operation for GetOp { Some(GetState::ReceivedRequest { .. }) | Some(GetState::AwaitingResponse { .. }) )); - tracing::info!(tx = %id, %key, target = %target.peer, "Seek contract"); + tracing::debug!( + tx = %id, + %key, + target = %target.peer, + "GET: RequestGet processing in state {:?}", + self.state + ); // Initialize stats for tracking the operation stats = Some(Box::new(GetStats { @@ -477,7 +492,12 @@ impl Operation for GetOp { .. }) => { // Contract found locally! - tracing::debug!(tx = %id, %key, "Contract found locally in RequestGet handler"); + tracing::info!( + tx = %id, + %key, + fetch_contract = *fetch_contract, + "GET: contract found locally in RequestGet handler" + ); // Check if this is a forwarded request or a local request match &self.state { @@ -516,24 +536,22 @@ impl Operation for GetOp { // Contract not found locally, proceed with forwarding tracing::debug!(tx = %id, %key, "Contract not found locally, forwarding to {}", target.peer); - // Keep current state - new_state = self.state; - // Prepare skip list with own peer ID let own_loc = op_manager.ring.connection_manager.own_location(); let mut new_skip_list = skip_list.clone(); new_skip_list.insert(own_loc.peer.clone()); - // Create seek node message - return_msg = Some(GetMsg::SeekNode { - key: *key, - id: *id, - target: target.clone(), - sender: own_loc.clone(), - fetch_contract: *fetch_contract, - htl: op_manager.ring.max_hops_to_live, - skip_list: new_skip_list, - }); + // Forward using standard routing helper + return try_forward_or_return( + *id, + *key, + (op_manager.ring.max_hops_to_live.max(1), *fetch_contract), + (target.clone(), sender.clone()), + new_skip_list, + op_manager, + stats, + ) + .await; } } } @@ -547,12 +565,39 @@ impl Operation for GetOp { htl, skip_list, } => { - let htl = *htl; + let ring_max_htl = op_manager.ring.max_hops_to_live.max(1); + let htl = (*htl).min(ring_max_htl); let id = *id; let key: ContractKey = *key; let fetch_contract = *fetch_contract; let this_peer = target.clone(); + if htl == 0 { + tracing::warn!( + tx = %id, + %key, + sender = %sender.peer, + "Dropping GET SeekNode with zero HTL" + ); + return build_op_result( + id, + None, + Some(GetMsg::ReturnGet { + id, + key, + value: StoreResponse { + state: None, + contract: None, + }, + sender: this_peer.clone(), + target: sender.clone(), + skip_list: skip_list.clone(), + }), + None, + stats, + ); + } + // Update stats with next peer if let Some(s) = stats.as_mut() { s.next_peer = Some(this_peer.clone()); @@ -658,6 +703,14 @@ impl Operation for GetOp { target, skip_list, } => { + tracing::info!( + tx = %id, + %key, + from = %sender.peer, + to = %target.peer, + skip = ?skip_list, + "GET: ReturnGet received with empty value" + ); // Handle case where neither contract nor state was found let this_peer = target; tracing::warn!( @@ -690,12 +743,16 @@ impl Operation for GetOp { // Try the next alternative let next_target = alternatives.remove(0); - tracing::debug!( + tracing::info!( tx = %id, - "Trying alternative peer {} at same hop level (attempt {}/{})", - next_target.peer, - attempts_at_hop + 1, - DEFAULT_MAX_BREADTH + %key, + next_peer = %next_target.peer, + fetch_contract, + attempts_at_hop = attempts_at_hop + 1, + max_attempts = DEFAULT_MAX_BREADTH, + tried = ?tried_peers, + remaining_alternatives = ?alternatives, + "Trying alternative peer at same hop level" ); return_msg = Some(GetMsg::SeekNode { @@ -733,6 +790,16 @@ impl Operation for GetOp { DEFAULT_MAX_BREADTH, ); + tracing::info!( + tx = %id, + %key, + new_candidates = ?new_candidates, + skip = ?new_skip_list, + hop = current_hop, + retries = retries + 1, + "GET seeking new candidates after exhausted alternatives" + ); + if !new_candidates.is_empty() { // Try with the best new peer let target = new_candidates.remove(0); @@ -767,6 +834,8 @@ impl Operation for GetOp { %key, %this_peer, target = %requester_peer, + tried = ?tried_peers, + skip = ?new_skip_list, "No other peers found while trying to get the contract, returning response to requester" ); return_msg = Some(GetMsg::ReturnGet { @@ -783,10 +852,13 @@ impl Operation for GetOp { } else { // Original requester, operation failed tracing::error!( - tx = %id, - "Failed getting a value for contract {}, reached max retries", - key - ); + tx = %id, + %key, + tried = ?tried_peers, + skip = ?skip_list, + "Failed getting a value for contract {}, reached max retries", + key + ); return_msg = None; new_state = None; result = Some(GetResult { @@ -810,6 +882,8 @@ impl Operation for GetOp { %key, %this_peer, target = %requester_peer, + tried = ?tried_peers, + skip = ?skip_list, "No other peers found while trying to get the contract, returning response to requester" ); return_msg = Some(GetMsg::ReturnGet { @@ -1165,7 +1239,7 @@ async fn try_forward_or_return( let mut new_skip_list = skip_list.clone(); new_skip_list.insert(this_peer.peer.clone()); - let new_htl = htl - 1; + let new_htl = htl.saturating_sub(1); let (new_target, alternatives) = if new_htl == 0 { tracing::warn!( diff --git a/crates/core/src/operations/put.rs b/crates/core/src/operations/put.rs index 2996bab9a..8b20fd811 100644 --- a/crates/core/src/operations/put.rs +++ b/crates/core/src/operations/put.rs @@ -173,6 +173,7 @@ impl Operation for PutOp { // Get the contract key and own location let key = contract.key(); let own_location = op_manager.ring.connection_manager.own_location(); + let prev_sender = sender.clone(); tracing::info!( "Requesting put for contract {} from {} to {}", @@ -208,7 +209,7 @@ impl Operation for PutOp { tracing::debug!( tx = %id, %key, - peer = %sender.peer, + peer = %prev_sender.peer, is_already_seeding, "Processing local PUT in initiating node" ); @@ -241,7 +242,7 @@ impl Operation for PutOp { tracing::debug!( tx = %id, %key, - peer = %sender.peer, + peer = %prev_sender.peer, "Marked contract as seeding locally" ); } @@ -249,7 +250,7 @@ impl Operation for PutOp { tracing::debug!( tx = %id, %key, - peer = %sender.peer, + peer = %prev_sender.peer, was_already_seeding = is_already_seeding, "Successfully processed contract locally with merge" ); @@ -267,9 +268,18 @@ impl Operation for PutOp { // Determine next forwarding target - find peers closer to the contract location // Don't reuse the target from RequestPut as that's US (the current processing peer) + let skip = [&prev_sender.peer]; let next_target = op_manager .ring - .closest_potentially_caching(&key, [&sender.peer].as_slice()); + .closest_potentially_caching(&key, skip.as_slice()); + + tracing::info!( + tx = %id, + %key, + next_target = ?next_target, + skip = ?skip, + "PUT seek evaluating next forwarding target" + ); if let Some(forward_target) = next_target { // Create a SeekNode message to forward to the next hop @@ -286,23 +296,24 @@ impl Operation for PutOp { // Transition to AwaitingResponse state to handle future SuccessfulPut messages new_state = Some(PutState::AwaitingResponse { key, - upstream: Some(sender.clone()), + upstream: Some(prev_sender.clone()), contract: contract.clone(), state: modified_value, subscribe, }); } else { // No other peers to forward to - we're the final destination - tracing::debug!( + tracing::warn!( tx = %id, %key, - "No peers to forward to - handling PUT completion locally, sending SuccessfulPut back to sender" + skip = ?skip, + "No peers to forward to after local processing - completing PUT locally" ); // Send SuccessfulPut back to the sender (upstream node) return_msg = Some(PutMsg::SuccessfulPut { id: *id, - target: sender.clone(), + target: prev_sender.clone(), key, sender: own_location.clone(), }); @@ -686,6 +697,20 @@ impl Operation for PutOp { skip_list, .. } => { + let max_htl = op_manager.ring.max_hops_to_live.max(1); + let htl_value = (*htl).min(max_htl); + if htl_value == 0 { + tracing::warn!( + tx = %id, + %contract, + sender = %sender.peer, + "Discarding PutForward with zero HTL" + ); + return Ok(OperationResult { + return_msg: None, + state: None, + }); + } // Get contract key and own location let key = contract.key(); let peer_loc = op_manager.ring.connection_manager.own_location(); @@ -717,7 +742,7 @@ impl Operation for PutOp { }; // Determine if this is the last hop and handle forwarding - let last_hop = if let Some(new_htl) = htl.checked_sub(1) { + let last_hop = if let Some(new_htl) = htl_value.checked_sub(1) { // Create updated skip list let mut new_skip_list = skip_list.clone(); new_skip_list.insert(sender.peer.clone()); @@ -1269,18 +1294,29 @@ where { let key = contract.key(); let contract_loc = Location::from(&key); + let max_htl = op_manager.ring.max_hops_to_live.max(1); + let capped_htl = htl.min(max_htl); + if capped_htl == 0 { + tracing::warn!( + tx = %id, + %key, + skip = ?skip_list, + "Discarding PutForward with zero HTL after sanitization" + ); + return true; + } let target_peer = op_manager .ring .closest_potentially_caching(&key, &skip_list); let own_pkloc = op_manager.ring.connection_manager.own_location(); let own_loc = own_pkloc.location.expect("infallible"); - tracing::debug!( + tracing::info!( tx = %id, %key, contract_location = %contract_loc.0, own_location = %own_loc.0, - skip_list_size = skip_list.len(), + skip_list = ?skip_list, "Evaluating PUT forwarding decision" ); @@ -1289,19 +1325,41 @@ where let other_distance = contract_loc.distance(other_loc); let self_distance = contract_loc.distance(own_loc); - tracing::debug!( + tracing::info!( tx = %id, %key, target_peer = %peer.peer, target_location = %other_loc.0, target_distance = ?other_distance, self_distance = ?self_distance, + skip_list = ?skip_list, "Found potential forward target" ); + if peer.peer == own_pkloc.peer { + tracing::info!( + tx = %id, + %key, + skip_list = ?skip_list, + "Not forwarding - candidate peer resolves to self" + ); + return true; + } + + if htl == 0 { + tracing::info!( + tx = %id, + %key, + target_peer = %peer.peer, + "HTL exhausted - storing locally" + ); + return true; + } + + let mut updated_skip_list = skip_list.clone(); + updated_skip_list.insert(own_pkloc.peer.clone()); + if other_distance < self_distance { - // forward the contract towards this node since it is indeed closer to the contract location - // and forget about it, no need to keep track of this op or wait for response tracing::info!( tx = %id, %key, @@ -1310,36 +1368,44 @@ where contract_location = %contract_loc.0, from_location = %own_loc.0, to_location = %other_loc.0, + skip_list = ?updated_skip_list, "Forwarding PUT to closer peer" ); - - let _ = conn_manager - .send( - &peer.peer, - (PutMsg::PutForward { - id, - sender: own_pkloc, - target: peer.clone(), - contract: contract.clone(), - new_value: new_value.clone(), - htl, - skip_list, - }) - .into(), - ) - .await; - return false; } else { - tracing::debug!( + tracing::info!( tx = %id, %key, - "Not forwarding - this peer is closest" + from_peer = %own_pkloc.peer, + to_peer = %peer.peer, + contract_location = %contract_loc.0, + from_location = %own_loc.0, + to_location = %other_loc.0, + skip_list = ?updated_skip_list, + "Forwarding PUT to peer despite non-improving distance (avoiding local minimum)" ); } + + let _ = conn_manager + .send( + &peer.peer, + (PutMsg::PutForward { + id, + sender: own_pkloc, + target: peer.clone(), + contract: contract.clone(), + new_value: new_value.clone(), + htl: capped_htl, + skip_list: updated_skip_list, + }) + .into(), + ) + .await; + return false; } else { - tracing::debug!( + tracing::info!( tx = %id, %key, + skip_list = ?skip_list, "No peers available for forwarding - caching locally" ); } diff --git a/crates/core/src/operations/subscribe.rs b/crates/core/src/operations/subscribe.rs index 9963fc8bf..c3047ae1d 100644 --- a/crates/core/src/operations/subscribe.rs +++ b/crates/core/src/operations/subscribe.rs @@ -3,7 +3,7 @@ use std::future::Future; use std::pin::Pin; pub(crate) use self::messages::SubscribeMsg; -use super::{OpEnum, OpError, OpInitialization, OpOutcome, Operation, OperationResult}; +use super::{get, OpEnum, OpError, OpInitialization, OpOutcome, Operation, OperationResult}; use crate::node::IsOperationCompleted; use crate::{ client_events::HostResult, @@ -19,6 +19,18 @@ use serde::{Deserialize, Serialize}; const MAX_RETRIES: usize = 10; +fn subscribers_snapshot(op_manager: &OpManager, key: &ContractKey) -> Vec { + op_manager + .ring + .subscribers_of(key) + .map(|subs| { + subs.iter() + .map(|loc| format!("{:.8}", loc.peer)) + .collect::>() + }) + .unwrap_or_default() +} + #[derive(Debug)] enum SubscribeState { /// Prepare the request to subscribe. @@ -72,57 +84,79 @@ pub(crate) async fn request_subscribe( sub_op: SubscribeOp, ) -> Result<(), OpError> { if let Some(SubscribeState::PrepareRequest { id, key }) = &sub_op.state { + let own_loc = op_manager.ring.connection_manager.own_location(); + let local_has_contract = super::has_contract(op_manager, *key).await?; + + tracing::debug!( + tx = %id, + %key, + subscriber_peer = %own_loc.peer, + local_has_contract, + "subscribe: request_subscribe invoked" + ); + + let mut skip_list: HashSet = HashSet::new(); + skip_list.insert(own_loc.peer.clone()); + // Use k_closest_potentially_caching to try multiple candidates - const EMPTY: &[PeerId] = &[]; // Try up to 3 candidates - let candidates = op_manager.ring.k_closest_potentially_caching(key, EMPTY, 3); + let candidates = op_manager + .ring + .k_closest_potentially_caching(key, &skip_list, 3); + + if tracing::enabled!(tracing::Level::INFO) { + let skip_display: Vec = skip_list + .iter() + .map(|peer| format!("{:.8}", peer)) + .collect(); + let candidate_display: Vec = candidates + .iter() + .map(|cand| format!("{:.8}", cand.peer)) + .collect(); + tracing::info!( + tx = %id, + %key, + skip = ?skip_display, + candidates = ?candidate_display, + "subscribe: k_closest_potentially_caching results" + ); + } let target = match candidates.first() { Some(peer) => peer.clone(), None => { - // No remote peers available - check if we have the contract locally - tracing::debug!(%key, "No remote peers available for subscription, checking locally"); - - if super::has_contract(op_manager, *key).await? { - // We have the contract locally - register subscription and complete immediately - tracing::info!(%key, tx = %id, "Contract available locally, registering local subscription"); - - // CRITICAL FIX for issue #2001: Register subscriber in DashMap before completing - // Without this, UPDATE operations won't find subscribers for locally-cached contracts - let subscriber = op_manager.ring.connection_manager.own_location(); - if op_manager + // No remote peers available - rely on local contract if present. + tracing::debug!( + %key, + "No remote peers available for subscription, checking locally" + ); + + if local_has_contract { + tracing::info!( + %key, + tx = %id, + "No remote peers, fulfilling subscription locally" + ); + return complete_local_subscription(op_manager, *id, *key).await; + } else { + let connection_count = op_manager.ring.connection_manager.num_connections(); + let subscribers = op_manager .ring - .add_subscriber(key, subscriber.clone()) - .is_err() - { - tracing::error!(%key, tx = %id, "Failed to add local subscriber - max subscribers reached"); - // Continue anyway - client requested subscription and contract is local - } else { - tracing::debug!(%key, tx = %id, subscriber = %subscriber.peer, "Successfully registered local subscriber"); - } - - match op_manager - .notify_node_event(crate::message::NodeEvent::LocalSubscribeComplete { - tx: *id, - key: *key, - subscribed: true, + .subscribers_of(key) + .map(|subs| { + subs.value() + .iter() + .map(|loc| format!("{:.8}", loc.peer)) + .collect::>() }) - .await - { - Ok(()) => { - tracing::debug!(%key, tx = %id, "sent LocalSubscribeComplete event") - } - Err(e) => { - tracing::error!(%key, tx = %id, error = %e, "failed to send LocalSubscribeComplete event") - } - } - - // Mark subscription as completed for atomicity tracking - op_manager.completed(*id); - - return Ok(()); - } else { - tracing::debug!(%key, "Contract not available locally and no remote peers"); + .unwrap_or_default(); + tracing::warn!( + %key, + tx = %id, + connection_count, + subscribers = ?subscribers, + "Contract not available locally and no remote peers" + ); return Err(RingError::NoCachingPeers(*key).into()); } } @@ -130,15 +164,23 @@ pub(crate) async fn request_subscribe( // Forward to remote peer let new_state = Some(SubscribeState::AwaitingResponse { - skip_list: vec![].into_iter().collect(), + skip_list, retries: 0, current_hop: op_manager.ring.max_hops_to_live, upstream_subscriber: None, }); + tracing::debug!( + tx = %id, + %key, + target_peer = %target.peer, + target_location = ?target.location, + "subscribe: forwarding RequestSub to target peer" + ); let msg = SubscribeMsg::RequestSub { id: *id, key: *key, target, + subscriber: own_loc.clone(), }; let op = SubscribeOp { id: *id, @@ -154,6 +196,38 @@ pub(crate) async fn request_subscribe( Ok(()) } +async fn complete_local_subscription( + op_manager: &OpManager, + id: Transaction, + key: ContractKey, +) -> Result<(), OpError> { + let subscriber = op_manager.ring.connection_manager.own_location(); + if let Err(err) = op_manager.ring.add_subscriber(&key, subscriber.clone()) { + tracing::warn!( + %key, + tx = %id, + subscriber = %subscriber.peer, + error = ?err, + "Failed to register local subscriber" + ); + } else { + tracing::debug!( + %key, + tx = %id, + subscriber = %subscriber.peer, + "Registered local subscriber" + ); + } + + op_manager + .notify_node_event(crate::message::NodeEvent::LocalSubscribeComplete { + tx: id, + key, + subscribed: true, + }) + .await +} + pub(crate) struct SubscribeOp { pub id: Transaction, state: Option, @@ -240,21 +314,138 @@ impl Operation for SubscribeOp { let new_state; match input { - SubscribeMsg::RequestSub { id, key, target } => { - // fast tracked from the request_sub func - debug_assert!(matches!( + SubscribeMsg::RequestSub { + id, + key, + target: _, + subscriber, + } => { + tracing::debug!( + tx = %id, + %key, + subscriber = %subscriber.peer, + "subscribe: processing RequestSub" + ); + let own_loc = op_manager.ring.connection_manager.own_location(); + + if !matches!( self.state, Some(SubscribeState::AwaitingResponse { .. }) - )); - let sender = op_manager.ring.connection_manager.own_location(); + | Some(SubscribeState::ReceivedRequest) + ) { + tracing::warn!( + tx = %id, + %key, + state = ?self.state, + "subscribe: RequestSub received in unexpected state" + ); + return Err(OpError::invalid_transition(self.id)); + } + + if super::has_contract(op_manager, *key).await? { + let before_direct = subscribers_snapshot(op_manager, key); + tracing::info!( + tx = %id, + %key, + subscriber = %subscriber.peer, + subscribers_before = ?before_direct, + "subscribe: handling RequestSub locally (contract available)" + ); + + if op_manager + .ring + .add_subscriber(key, subscriber.clone()) + .is_err() + { + tracing::warn!( + tx = %id, + %key, + subscriber = %subscriber.peer, + subscribers_before = ?before_direct, + "subscribe: direct registration failed (max subscribers reached)" + ); + return Ok(OperationResult { + return_msg: Some(NetMessage::from(SubscribeMsg::ReturnSub { + id: *id, + key: *key, + sender: own_loc.clone(), + target: subscriber.clone(), + subscribed: false, + })), + state: None, + }); + } + + let after_direct = subscribers_snapshot(op_manager, key); + tracing::info!( + tx = %id, + %key, + subscriber = %subscriber.peer, + subscribers_after = ?after_direct, + "subscribe: registered direct subscriber (RequestSub)" + ); + + if subscriber.peer == own_loc.peer { + tracing::debug!( + tx = %id, + %key, + "RequestSub originated locally; sending LocalSubscribeComplete" + ); + if let Err(err) = op_manager + .notify_node_event( + crate::message::NodeEvent::LocalSubscribeComplete { + tx: *id, + key: *key, + subscribed: true, + }, + ) + .await + { + tracing::error!( + tx = %id, + %key, + error = %err, + "Failed to send LocalSubscribeComplete event for RequestSub" + ); + return Err(err); + } + + return build_op_result(self.id, None, None); + } + + let return_msg = SubscribeMsg::ReturnSub { + id: *id, + key: *key, + sender: own_loc.clone(), + target: subscriber.clone(), + subscribed: true, + }; + + return build_op_result(self.id, None, Some(return_msg)); + } + + let mut skip = HashSet::new(); + skip.insert(subscriber.peer.clone()); + skip.insert(own_loc.peer.clone()); + + let forward_target = op_manager + .ring + .k_closest_potentially_caching(key, &skip, 3) + .into_iter() + .find(|candidate| candidate.peer != own_loc.peer) + .ok_or_else(|| RingError::NoCachingPeers(*key)) + .map_err(OpError::from)?; + + skip.insert(forward_target.peer.clone()); + new_state = self.state; return_msg = Some(SubscribeMsg::SeekNode { id: *id, key: *key, - target: target.clone(), - subscriber: sender.clone(), - skip_list: HashSet::from([sender.peer]), - htl: op_manager.ring.max_hops_to_live, + target: forward_target, + subscriber: subscriber.clone(), + skip_list: skip.clone(), + htl: op_manager.ring.max_hops_to_live.max(1), retries: 0, }); } @@ -267,6 +458,8 @@ impl Operation for SubscribeOp { htl, retries, } => { + let ring_max_htl = op_manager.ring.max_hops_to_live.max(1); + let htl = (*htl).min(ring_max_htl); let this_peer = op_manager.ring.connection_manager.own_location(); let return_not_subbed = || -> OperationResult { OperationResult { @@ -281,6 +474,16 @@ impl Operation for SubscribeOp { } }; + if htl == 0 { + tracing::warn!( + tx = %id, + %key, + subscriber = %subscriber.peer, + "Dropping Subscribe SeekNode with zero HTL" + ); + return Ok(return_not_subbed()); + } + if !super::has_contract(op_manager, *key).await? { tracing::debug!(tx = %id, %key, "Contract not found, trying other peer"); @@ -288,53 +491,127 @@ impl Operation for SubscribeOp { let candidates = op_manager .ring .k_closest_potentially_caching(key, skip_list, 3); - let Some(new_target) = candidates.first() else { - tracing::warn!(tx = %id, %key, "No remote peer available for forwarding"); - return Ok(return_not_subbed()); - }; - let new_target = new_target.clone(); - let new_htl = htl - 1; + if candidates.is_empty() { + let connection_count = + op_manager.ring.connection_manager.num_connections(); + tracing::warn!( + tx = %id, + %key, + skip = ?skip_list, + connection_count, + "No remote peer available for forwarding" + ); + tracing::info!( + tx = %id, + %key, + "Attempting to fetch contract locally before aborting subscribe" + ); - if new_htl == 0 { - tracing::debug!(tx = %id, %key, "Max number of hops reached while trying to get contract"); - return Ok(return_not_subbed()); - } + let get_op = get::start_op(*key, true, false); + if let Err(fetch_err) = + get::request_get(op_manager, get_op, HashSet::new()).await + { + tracing::warn!( + tx = %id, + %key, + error = %fetch_err, + "Failed to fetch contract locally while handling subscribe" + ); + return Ok(return_not_subbed()); + } - let mut new_skip_list = skip_list.clone(); - new_skip_list.insert(target.peer.clone()); - - tracing::debug!(tx = %id, new_target = %new_target.peer, "Forward request to peer"); - // Retry seek node when the contract to subscribe has not been found in this node - return build_op_result( - *id, - Some(SubscribeState::AwaitingResponse { - skip_list: new_skip_list.clone(), - retries: *retries, - current_hop: new_htl, - upstream_subscriber: Some(subscriber.clone()), - }), - (SubscribeMsg::SeekNode { - id: *id, - key: *key, - subscriber: this_peer, - target: new_target, - skip_list: new_skip_list, - htl: new_htl, - retries: *retries, - }) - .into(), - ); + if !super::has_contract(op_manager, *key).await? { + tracing::warn!( + tx = %id, + %key, + "Contract still unavailable locally after fetch attempt" + ); + return Ok(return_not_subbed()); + } + } else { + let Some(new_target) = candidates.first() else { + return Ok(return_not_subbed()); + }; + let new_target = new_target.clone(); + let new_htl = htl.saturating_sub(1); + + if new_htl == 0 { + tracing::debug!(tx = %id, %key, "Max number of hops reached while trying to get contract"); + return Ok(return_not_subbed()); + } + + let mut new_skip_list = skip_list.clone(); + new_skip_list.insert(target.peer.clone()); + + tracing::info!( + tx = %id, + %key, + new_target = %new_target.peer, + upstream = %subscriber.peer, + "Forward request to peer" + ); + tracing::debug!( + tx = %id, + %key, + candidates = ?candidates, + skip = ?new_skip_list, + "Forwarding seek to next candidate" + ); + // Retry seek node when the contract to subscribe has not been found in this node + return build_op_result( + *id, + Some(SubscribeState::AwaitingResponse { + skip_list: new_skip_list.clone(), + retries: *retries, + current_hop: new_htl, + upstream_subscriber: Some(subscriber.clone()), + }), + (SubscribeMsg::SeekNode { + id: *id, + key: *key, + subscriber: this_peer, + target: new_target, + skip_list: new_skip_list, + htl: new_htl, + retries: *retries, + }) + .into(), + ); + } + // After fetch attempt we should now have the contract locally. } + let before_direct = subscribers_snapshot(op_manager, key); + tracing::info!( + tx = %id, + %key, + subscriber = %subscriber.peer, + subscribers_before = ?before_direct, + "subscribe: attempting to register direct subscriber" + ); if op_manager .ring .add_subscriber(key, subscriber.clone()) .is_err() { - tracing::debug!(tx = %id, %key, "Max number of subscribers reached for contract"); + tracing::warn!( + tx = %id, + %key, + subscriber = %subscriber.peer, + subscribers_before = ?before_direct, + "subscribe: direct registration failed (max subscribers reached)" + ); // max number of subscribers for this contract reached return Ok(return_not_subbed()); } + let after_direct = subscribers_snapshot(op_manager, key); + tracing::info!( + tx = %id, + %key, + subscriber = %subscriber.peer, + subscribers_after = ?after_direct, + "subscribe: registered direct subscriber" + ); match self.state { Some(SubscribeState::ReceivedRequest) => { @@ -433,6 +710,56 @@ impl Operation for SubscribeOp { provider = %sender.peer, "Subscribed to contract" ); + tracing::info!( + tx = %id, + %key, + upstream = upstream_subscriber + .as_ref() + .map(|loc| format!("{:.8}", loc.peer)) + .unwrap_or_else(|| "".into()), + "Handling ReturnSub (subscribed=true)" + ); + if let Some(upstream_subscriber) = upstream_subscriber.as_ref() { + let before_upstream = subscribers_snapshot(op_manager, key); + tracing::info!( + tx = %id, + %key, + upstream = %upstream_subscriber.peer, + subscribers_before = ?before_upstream, + "subscribe: attempting to register upstream link" + ); + if op_manager + .ring + .add_subscriber(key, upstream_subscriber.clone()) + .is_err() + { + tracing::warn!( + tx = %id, + %key, + upstream = %upstream_subscriber.peer, + subscribers_before = ?before_upstream, + "subscribe: upstream registration failed (max subscribers reached)" + ); + } else { + let after_upstream = subscribers_snapshot(op_manager, key); + tracing::info!( + tx = %id, + %key, + upstream = %upstream_subscriber.peer, + subscribers_after = ?after_upstream, + "subscribe: registered upstream link" + ); + } + } + + let before_provider = subscribers_snapshot(op_manager, key); + tracing::info!( + tx = %id, + %key, + provider = %sender.peer, + subscribers_before = ?before_provider, + "subscribe: registering provider/subscription source" + ); if op_manager.ring.add_subscriber(key, sender.clone()).is_err() { // concurrently it reached max number of subscribers for this contract tracing::debug!( @@ -442,6 +769,14 @@ impl Operation for SubscribeOp { ); return Err(OpError::UnexpectedOpState); } + let after_provider = subscribers_snapshot(op_manager, key); + tracing::info!( + tx = %id, + %key, + provider = %sender.peer, + subscribers_after = ?after_provider, + "subscribe: registered provider/subscription source" + ); new_state = Some(SubscribeState::Completed { key: *key }); if let Some(upstream_subscriber) = upstream_subscriber { @@ -518,6 +853,7 @@ mod messages { id: Transaction, key: ContractKey, target: PeerKeyLocation, + subscriber: PeerKeyLocation, }, SeekNode { id: Transaction, @@ -549,6 +885,7 @@ mod messages { fn target(&self) -> Option> { match self { + Self::RequestSub { target, .. } => Some(target), Self::SeekNode { target, .. } => Some(target), Self::ReturnSub { target, .. } => Some(target), _ => None, diff --git a/crates/core/src/operations/subscribe/tests.rs b/crates/core/src/operations/subscribe/tests.rs index 8b1d763c1..af8c3dfad 100644 --- a/crates/core/src/operations/subscribe/tests.rs +++ b/crates/core/src/operations/subscribe/tests.rs @@ -13,13 +13,15 @@ use std::collections::HashSet; struct TestRing { pub k_closest_calls: std::sync::Arc, usize)>>>, pub candidates: Vec, + pub own_peer: PeerId, } impl TestRing { - fn new(candidates: Vec, _own_location: PeerKeyLocation) -> Self { + fn new(candidates: Vec, own_location: PeerKeyLocation) -> Self { Self { k_closest_calls: std::sync::Arc::new(tokio::sync::Mutex::new(Vec::new())), candidates, + own_peer: own_location.peer, } } @@ -30,12 +32,18 @@ impl TestRing { k: usize, ) -> Vec { // Record the call - use async lock - let skip_vec: Vec = self + let mut skip_vec: Vec = self .candidates .iter() .filter(|peer| skip_list.has_element(peer.peer.clone())) .map(|peer| peer.peer.clone()) .collect(); + if skip_list.has_element(self.own_peer.clone()) + // avoid duplicates if own peer also in candidates + && !skip_vec.iter().any(|p| p == &self.own_peer) + { + skip_vec.push(self.own_peer.clone()); + } // Use async lock self.k_closest_calls.lock().await.push((*key, skip_vec, k)); @@ -87,10 +95,11 @@ async fn test_subscription_routing_calls_k_closest_with_skip_list() { Some(SubscribeState::PrepareRequest { .. }) )); - // 2. Test k_closest_potentially_caching with empty skip list (simulates request_subscribe call) - const EMPTY: &[PeerId] = &[]; + // 2. Test k_closest_potentially_caching with initial skip list containing self (simulates request_subscribe call) + let mut initial_skip = HashSet::new(); + initial_skip.insert(own_location.peer.clone()); let initial_candidates = test_ring - .k_closest_potentially_caching(&contract_key, EMPTY, 3) + .k_closest_potentially_caching(&contract_key, &initial_skip, 3) .await; // 3. Verify initial call was recorded @@ -106,8 +115,12 @@ async fn test_subscription_routing_calls_k_closest_with_skip_list() { ); assert_eq!( k_closest_calls[0].1.len(), - 0, - "Initial call should have empty skip list" + 1, + "Initial call should only skip own peer" + ); + assert_eq!( + k_closest_calls[0].1[0], own_location.peer, + "Initial skip list should contain own peer" ); assert_eq!(k_closest_calls[0].2, 3, "Should request 3 candidates"); drop(k_closest_calls); @@ -206,7 +219,7 @@ async fn test_subscription_routing_calls_k_closest_with_skip_list() { // This test validates the TestRing behavior that supports subscription routing: // 1. start_op always works (no early return bug) - // 2. k_closest_potentially_caching is called with empty skip list initially + // 2. k_closest_potentially_caching is called with a skip list that already excludes the local peer // 3. k_closest_potentially_caching is called with proper skip list after failures // 4. Skip list correctly excludes failed peers // 5. Alternative peers are found after failures @@ -254,10 +267,11 @@ async fn test_subscription_production_code_paths_use_k_closest() { )); // Test 2: Simulate the k_closest_potentially_caching call made in request_subscribe - // (Line 72 in subscribe.rs: op_manager.ring.k_closest_potentially_caching(key, EMPTY, 3)) - const EMPTY: &[PeerId] = &[]; + // (Line 72 in subscribe.rs: op_manager.ring.k_closest_potentially_caching(key, skip_list, 3)) + let mut initial_skip = HashSet::new(); + initial_skip.insert(own_location.peer.clone()); let initial_candidates = test_ring - .k_closest_potentially_caching(&contract_key, EMPTY, 3) + .k_closest_potentially_caching(&contract_key, &initial_skip, 3) .await; // Verify the call was recorded (this proves our test setup works) @@ -273,8 +287,12 @@ async fn test_subscription_production_code_paths_use_k_closest() { ); assert_eq!( k_closest_calls[0].1.len(), - 0, - "Should use empty skip list initially" + 1, + "Should skip own peer initially" + ); + assert_eq!( + k_closest_calls[0].1[0], own_location.peer, + "Skip list should contain own peer" ); assert_eq!(k_closest_calls[0].2, 3, "Should request 3 candidates"); drop(k_closest_calls); @@ -388,7 +406,7 @@ async fn test_subscription_production_code_paths_use_k_closest() { #[tokio::test] async fn test_subscription_validates_k_closest_usage() { // This test validates that the subscription operation correctly: - // 1. Calls k_closest_potentially_caching with an empty skip list on first attempt + // 1. Calls k_closest_potentially_caching with a skip list containing the local peer on first attempt // 2. Accumulates failed peers in the skip list // 3. Calls k_closest_potentially_caching with the skip list on retry @@ -419,16 +437,25 @@ async fn test_subscription_validates_k_closest_usage() { // Test 1: Validate the exact call pattern from request_subscribe (line 72) { - const EMPTY: &[PeerId] = &[]; + let mut initial_skip = HashSet::new(); + initial_skip.insert(test_ring.own_peer.clone()); let _candidates = test_ring - .k_closest_potentially_caching(&contract_key, EMPTY, 3) + .k_closest_potentially_caching(&contract_key, &initial_skip, 3) .await; let calls = test_ring.k_closest_calls.lock().await; assert_eq!(calls.len(), 1, "Should record the call"); let (key, skip_list, k) = &calls[0]; assert_eq!(*key, contract_key); - assert!(skip_list.is_empty(), "First attempt has empty skip list"); + assert_eq!( + skip_list.len(), + 1, + "First attempt should only skip own peer" + ); + assert_eq!( + skip_list[0], test_ring.own_peer, + "Skip list should contain own peer" + ); assert_eq!(*k, 3, "Uses k=3 as per fix"); } diff --git a/crates/core/src/operations/update.rs b/crates/core/src/operations/update.rs index 4b21ccc72..aa51b2244 100644 --- a/crates/core/src/operations/update.rs +++ b/crates/core/src/operations/update.rs @@ -4,7 +4,7 @@ use freenet_stdlib::client_api::{ErrorKind, HostResponse}; use freenet_stdlib::prelude::*; pub(crate) use self::messages::UpdateMsg; -use super::{OpEnum, OpError, OpInitialization, OpOutcome, Operation, OperationResult}; +use super::{get, OpEnum, OpError, OpInitialization, OpOutcome, Operation, OperationResult}; use crate::contract::{ContractHandlerEvent, StoreResponse}; use crate::message::{InnerMessage, NetMessage, NodeEvent, Transaction}; use crate::node::IsOperationCompleted; @@ -13,6 +13,7 @@ use crate::{ client_events::HostResult, node::{NetworkBridge, OpManager, PeerId}, }; +use std::collections::HashSet; pub(crate) struct UpdateOp { pub id: Transaction, @@ -248,8 +249,13 @@ impl Operation for UpdateOp { return_msg = None; } else { // Get broadcast targets for propagating UPDATE to subscribers - let broadcast_to = op_manager - .get_broadcast_targets_update(key, &request_sender.peer); + let mut broadcast_to = + op_manager.get_broadcast_targets_update(key, &request_sender.peer); + + if broadcast_to.is_empty() { + broadcast_to = op_manager + .compute_update_fallback_targets(key, &request_sender.peer); + } if broadcast_to.is_empty() { tracing::debug!( @@ -258,16 +264,17 @@ impl Operation for UpdateOp { "No broadcast targets, completing UPDATE locally" ); - if upstream.is_none() { - new_state = Some(UpdateState::Finished { - key: *key, - summary: summary.clone(), - }); - } else { - new_state = None; - } + let raw_state = State::from(updated_value.clone()); + let summary = StateSummary::from(raw_state.into_bytes()); - return_msg = None; + return_msg = Some(UpdateMsg::SuccessfulUpdate { + id: *id, + target: request_sender.clone(), + summary: summary.clone(), + key: *key, + sender: self_location.clone(), + }); + new_state = Some(UpdateState::Finished { key: *key, summary }); } else { // Broadcast to other peers match try_to_broadcast( @@ -292,10 +299,21 @@ impl Operation for UpdateOp { } } else { // Contract not found locally - forward to another peer - let next_target = op_manager.ring.closest_potentially_caching( - key, - [&self_location.peer, &request_sender.peer].as_slice(), - ); + let skip_peers = [&self_location.peer, &request_sender.peer]; + let next_target = op_manager + .ring + .closest_potentially_caching(key, skip_peers.as_slice()) + .or_else(|| { + op_manager + .ring + .k_closest_potentially_caching( + key, + skip_peers.as_slice(), + 5, + ) + .into_iter() + .next() + }); if let Some(forward_target) = next_target { tracing::debug!( @@ -316,10 +334,33 @@ impl Operation for UpdateOp { }); new_state = None; } else { + let skip_list = [&self_location.peer, &request_sender.peer]; + let subscribers = op_manager + .ring + .subscribers_of(key) + .map(|subs| { + subs.value() + .iter() + .map(|loc| format!("{:.8}", loc.peer)) + .collect::>() + }) + .unwrap_or_default(); + let candidates = op_manager + .ring + .k_closest_potentially_caching(key, skip_list.as_slice(), 5) + .into_iter() + .map(|loc| format!("{:.8}", loc.peer)) + .collect::>(); + let connection_count = + op_manager.ring.connection_manager.num_connections(); // No peers available and we don't have the contract - error tracing::error!( tx = %id, %key, + subscribers = ?subscribers, + candidates = ?candidates, + connection_count, + request_sender = %request_sender.peer, "Cannot handle UPDATE: contract not found locally and no peers to forward to" ); return Err(OpError::RingError(RingError::NoCachingPeers(*key))); @@ -386,18 +427,30 @@ impl Operation for UpdateOp { return_msg = None; } else { // Get broadcast targets - let broadcast_to = + let mut broadcast_to = op_manager.get_broadcast_targets_update(key, &sender.peer); - // If no peers to broadcast to, nothing else to do + if broadcast_to.is_empty() { + broadcast_to = + op_manager.compute_update_fallback_targets(key, &sender.peer); + } + if broadcast_to.is_empty() { tracing::debug!( tx = %id, %key, - "No broadcast targets for SeekNode - completing locally" + "No broadcast targets for SeekNode - completing with SuccessfulUpdate" ); + let raw_state = State::from(updated_value.clone()); + let summary = StateSummary::from(raw_state.into_bytes()); + return_msg = Some(UpdateMsg::SuccessfulUpdate { + id: *id, + target: sender.clone(), + summary, + key: *key, + sender: op_manager.ring.connection_manager.own_location(), + }); new_state = None; - return_msg = None; } else { // Have peers to broadcast to - use try_to_broadcast match try_to_broadcast( @@ -447,13 +500,90 @@ impl Operation for UpdateOp { }); new_state = None; } else { - // No more peers to try - error - tracing::error!( + tracing::warn!( tx = %id, %key, - "Cannot handle UPDATE SeekNode: contract not found and no peers to forward to" + "No forwarding targets for UPDATE SeekNode - attempting local fetch" ); - return Err(OpError::RingError(RingError::NoCachingPeers(*key))); + + let mut fetch_skip = HashSet::new(); + fetch_skip.insert(sender.peer.clone()); + + let get_op = get::start_op(*key, true, false); + if let Err(fetch_err) = + get::request_get(op_manager, get_op, fetch_skip).await + { + tracing::warn!( + tx = %id, + %key, + error = %fetch_err, + "Failed to fetch contract while handling UPDATE SeekNode" + ); + return Err(OpError::RingError(RingError::NoCachingPeers(*key))); + } + + if super::has_contract(op_manager, *key).await? { + tracing::info!( + tx = %id, + %key, + "Successfully fetched contract locally, applying UPDATE" + ); + let updated_value = update_contract( + op_manager, + *key, + value.clone(), + related_contracts.clone(), + ) + .await?; + + let mut broadcast_to = + op_manager.get_broadcast_targets_update(key, &sender.peer); + + if broadcast_to.is_empty() { + broadcast_to = op_manager + .compute_update_fallback_targets(key, &sender.peer); + } + + if broadcast_to.is_empty() { + let raw_state = State::from(updated_value); + let summary = StateSummary::from(raw_state.into_bytes()); + + return_msg = Some(UpdateMsg::SuccessfulUpdate { + id: *id, + target: sender.clone(), + summary, + key: *key, + sender: op_manager.ring.connection_manager.own_location(), + }); + new_state = None; + } else { + match try_to_broadcast( + *id, + true, + op_manager, + self.state, + (broadcast_to, sender.clone()), + *key, + value.clone(), + false, + ) + .await + { + Ok((state, msg)) => { + new_state = state; + return_msg = msg; + } + Err(err) => return Err(err), + } + } + } else { + tracing::error!( + tx = %id, + %key, + "Contract still unavailable after fetch attempt during UPDATE SeekNode" + ); + return Err(OpError::RingError(RingError::NoCachingPeers(*key))); + } } } } @@ -487,9 +617,14 @@ impl Operation for UpdateOp { new_state = None; return_msg = None; } else { - let broadcast_to = + let mut broadcast_to = op_manager.get_broadcast_targets_update(key, &sender.peer); + if broadcast_to.is_empty() { + broadcast_to = + op_manager.compute_update_fallback_targets(key, &sender.peer); + } + tracing::debug!( "Successfully updated a value for contract {} @ {:?} - BroadcastTo - update", key, @@ -649,9 +784,11 @@ impl OpManager { .ring .subscribers_of(key) .map(|subs| { + let self_peer = self.ring.connection_manager.get_peer_key(); subs.value() .iter() .filter(|pk| &pk.peer != sender) + .filter(|pk| self_peer.as_ref().map(|me| &pk.peer != me).unwrap_or(true)) .cloned() .collect::>() }) @@ -671,15 +808,60 @@ impl OpManager { subscribers.len() ); } else { + let own_peer = self.ring.connection_manager.get_peer_key(); + let skip_slice = std::slice::from_ref(sender); + let fallback_candidates = self + .ring + .k_closest_potentially_caching(key, skip_slice, 5) + .into_iter() + .map(|candidate| format!("{:.8}", candidate.peer)) + .collect::>(); + tracing::warn!( - "UPDATE_PROPAGATION: contract={:.8} from={} NO_TARGETS - update will not propagate", + "UPDATE_PROPAGATION: contract={:.8} from={} NO_TARGETS - update will not propagate (self={:?}, fallback_candidates={:?})", key, - sender + sender, + own_peer.map(|p| format!("{:.8}", p)), + fallback_candidates ); } subscribers } + + fn compute_update_fallback_targets( + &self, + key: &ContractKey, + sender: &PeerId, + ) -> Vec { + let mut skip: HashSet = HashSet::new(); + skip.insert(sender.clone()); + if let Some(self_peer) = self.ring.connection_manager.get_peer_key() { + skip.insert(self_peer); + } + + let candidates = self + .ring + .k_closest_potentially_caching(key, &skip, 3) + .into_iter() + .filter(|candidate| &candidate.peer != sender) + .collect::>(); + + if !candidates.is_empty() { + tracing::info!( + "UPDATE_PROPAGATION: contract={:.8} from={} using fallback targets={}", + key, + sender, + candidates + .iter() + .map(|c| format!("{:.8}", c.peer)) + .collect::>() + .join(",") + ); + } + + candidates + } } fn build_op_result( diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index 8db58fcbb..2f326b978 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -1,5 +1,6 @@ use parking_lot::Mutex; use rand::prelude::IndexedRandom; +use std::collections::btree_map::Entry; use crate::topology::{Limits, TopologyManager}; @@ -16,6 +17,7 @@ pub(crate) struct ConnectionManager { /// Is important to keep track of this so no more connections are accepted prematurely. own_location: Arc, peer_key: Arc>>, + is_gateway: bool, pub min_connections: usize, pub max_connections: usize, pub rnd_if_htl_above: usize, @@ -44,6 +46,7 @@ impl ConnectionManager { Location::random().as_f64().to_le_bytes(), )), ), + false, ) } } @@ -102,6 +105,7 @@ impl ConnectionManager { config.peer_id.clone(), own_location, ), + config.is_gateway, ) } @@ -112,6 +116,7 @@ impl ConnectionManager { max_connections: usize, rnd_if_htl_above: usize, (pub_key, peer_id, own_location): (TransportPublicKey, Option, AtomicU64), + is_gateway: bool, ) -> Self { let topology_manager = Arc::new(RwLock::new(TopologyManager::new(Limits { max_upstream_bandwidth, @@ -128,6 +133,7 @@ impl ConnectionManager { topology_manager, own_location: own_location.into(), peer_key: Arc::new(Mutex::new(peer_id)), + is_gateway, min_connections, max_connections, rnd_if_htl_above, @@ -141,10 +147,31 @@ impl ConnectionManager { /// # Panic /// Will panic if the node checking for this condition has no location assigned. pub fn should_accept(&self, location: Location, peer_id: &PeerId) -> bool { - tracing::debug!("Checking if should accept connection"); + tracing::info!("Checking if should accept connection"); let open = self .open_connections .load(std::sync::atomic::Ordering::SeqCst); + let reserved_before = self + .reserved_connections + .load(std::sync::atomic::Ordering::SeqCst); + + tracing::info!( + %peer_id, + open, + reserved_before, + is_gateway = self.is_gateway, + "should_accept: evaluating direct acceptance guard" + ); + + if self.is_gateway && (open > 0 || reserved_before > 0) { + tracing::info!( + %peer_id, + open, + reserved_before, + "Gateway evaluating additional direct connection (post-bootstrap)" + ); + } + let total_conn = self .reserved_connections .fetch_add(1, std::sync::atomic::Ordering::SeqCst) @@ -155,6 +182,23 @@ impl ConnectionManager { return true; } + const GATEWAY_DIRECT_ACCEPT_LIMIT: usize = 2; + if self.is_gateway { + let direct_total = open + reserved_before; + if direct_total >= GATEWAY_DIRECT_ACCEPT_LIMIT { + tracing::info!( + %peer_id, + open, + reserved_before, + limit = GATEWAY_DIRECT_ACCEPT_LIMIT, + "Gateway reached direct-accept limit; forwarding join request instead" + ); + self.reserved_connections + .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); + return false; + } + } + if self.location_for_peer.read().get(peer_id).is_some() { // avoid connecting more than once to the same peer self.reserved_connections @@ -164,10 +208,10 @@ impl ConnectionManager { } let accepted = if total_conn < self.min_connections { - tracing::debug!(%peer_id, "Accepted connection, below min connections"); + tracing::info!(%peer_id, "Accepted connection, below min connections"); true } else if total_conn >= self.max_connections { - tracing::debug!(%peer_id, "Rejected connection, max connections reached"); + tracing::info!(%peer_id, "Rejected connection, max connections reached"); false } else { let accepted = self @@ -177,9 +221,9 @@ impl ConnectionManager { .unwrap_or(true); if accepted { - tracing::debug!(%peer_id, "Accepted connection, topology manager"); + tracing::info!(%peer_id, "Accepted connection, topology manager"); } else { - tracing::debug!(%peer_id, "Rejected connection, topology manager"); + tracing::info!(%peer_id, "Rejected connection, topology manager"); } accepted }; @@ -187,11 +231,39 @@ impl ConnectionManager { self.reserved_connections .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); } else { - tracing::debug!(%peer_id, "Accepted connection, reserving spot"); + tracing::info!(%peer_id, "Accepted connection, reserving spot"); + self.record_pending_location(peer_id, location); } accepted } + /// Record the advertised location for a peer that we have decided to accept. + /// + /// This makes the peer discoverable to the routing layer even before the connection + /// is fully established. The entry is removed automatically if the handshake fails + /// via `prune_in_transit_connection`. + pub fn record_pending_location(&self, peer_id: &PeerId, location: Location) { + let mut locations = self.location_for_peer.write(); + let entry = locations.entry(peer_id.clone()); + match entry { + Entry::Occupied(_) => { + tracing::info!( + %peer_id, + %location, + "record_pending_location: location already known" + ); + } + Entry::Vacant(v) => { + tracing::info!( + %peer_id, + %location, + "record_pending_location: registering advertised location for peer" + ); + v.insert(location); + } + } + } + /// Update this node location. pub fn update_location(&self, loc: Option) { if let Some(loc) = loc { @@ -251,7 +323,7 @@ impl ConnectionManager { } pub fn add_connection(&self, loc: Location, peer: PeerId, was_reserved: bool) { - tracing::debug!(%peer, "Adding connection"); + tracing::info!(%peer, %loc, %was_reserved, "Adding connection to topology"); debug_assert!(self.get_peer_key().expect("should be set") != peer); if was_reserved { let old = self @@ -332,6 +404,10 @@ impl ConnectionManager { self.connections_by_location.read().clone() } + pub(super) fn get_known_locations(&self) -> BTreeMap { + self.location_for_peer.read().clone() + } + /// Get a random peer from the known ring connections. pub fn random_peer(&self, filter_fn: F) -> Option where diff --git a/crates/core/src/ring/mod.rs b/crates/core/src/ring/mod.rs index 68212e507..6ee9330eb 100644 --- a/crates/core/src/ring/mod.rs +++ b/crates/core/src/ring/mod.rs @@ -22,7 +22,7 @@ use either::Either; use freenet_stdlib::prelude::ContractKey; use itertools::Itertools; use parking_lot::RwLock; -use rand::{prelude::IndexedRandom, Rng}; +use rand::Rng; use crate::message::TransactionType; use crate::topology::rate::Rate; @@ -270,14 +270,36 @@ impl Ring { let router = self.router.read(); let target_location = Location::from(contract_key); - // Get all connected peers through the connection manager (never includes self) + let mut candidates: BTreeMap = BTreeMap::new(); + let connections = self.connection_manager.get_connections_by_location(); - let peers = connections.values().filter_map(|conns| { - let conn = conns.choose(&mut rand::rng())?; - (!skip_list.has_element(conn.location.peer.clone())).then_some(&conn.location) - }); + for conns in connections.values() { + for conn in conns { + let peer = conn.location.peer.clone(); + if skip_list.has_element(peer.clone()) { + continue; + } + candidates + .entry(peer) + .or_insert_with(|| conn.location.clone()); + } + } + + let known_locations = self.connection_manager.get_known_locations(); + for (peer, location) in known_locations { + if skip_list.has_element(peer.clone()) { + continue; + } + candidates + .entry(peer.clone()) + .or_insert_with(|| PeerKeyLocation { + peer, + location: Some(location), + }); + } + + let peers = candidates.values(); - // Pass peers directly to select_k_best_peers since we never include self router .select_k_best_peers(peers, target_location, k) .into_iter() diff --git a/crates/core/src/ring/seeding.rs b/crates/core/src/ring/seeding.rs index 45b2d88b6..3474b542a 100644 --- a/crates/core/src/ring/seeding.rs +++ b/crates/core/src/ring/seeding.rs @@ -1,6 +1,7 @@ use super::{Location, PeerKeyLocation, Score}; use dashmap::{mapref::one::Ref as DmRef, DashMap}; use freenet_stdlib::prelude::ContractKey; +use tracing::{info, warn}; pub(crate) struct SeedingManager { /// The container for subscriber is a vec instead of something like a hashset @@ -110,18 +111,61 @@ impl SeedingManager { .subscribers .entry(*contract) .or_insert(Vec::with_capacity(Self::TOTAL_MAX_SUBSCRIPTIONS)); + let before = subs + .iter() + .map(|loc| format!("{:.8}", loc.peer)) + .collect::>(); + info!( + %contract, + subscriber = %subscriber.peer, + subscribers_before = ?before, + current_len = subs.len(), + "seeding_manager: attempting to add subscriber" + ); if subs.len() >= Self::MAX_SUBSCRIBERS { + warn!( + %contract, + subscriber = %subscriber.peer, + subscribers_before = ?before, + "seeding_manager: max subscribers reached" + ); return Err(()); } - if let Err(next_idx) = subs.value_mut().binary_search(&subscriber) { - let subs = subs.value_mut(); - if subs.len() == Self::MAX_SUBSCRIBERS { - return Err(()); - } else { - subs.insert(next_idx, subscriber); + let subs_vec = subs.value_mut(); + match subs_vec.binary_search(&subscriber) { + Ok(_) => { + info!( + %contract, + subscriber = %subscriber.peer, + subscribers_before = ?before, + "seeding_manager: subscriber already registered" + ); + Ok(()) + } + Err(next_idx) => { + if subs_vec.len() == Self::MAX_SUBSCRIBERS { + warn!( + %contract, + subscriber = %subscriber.peer, + subscribers_before = ?before, + "seeding_manager: max subscribers reached during insert" + ); + Err(()) + } else { + subs_vec.insert(next_idx, subscriber); + let after = subs_vec + .iter() + .map(|loc| format!("{:.8}", loc.peer)) + .collect::>(); + info!( + %contract, + subscribers_after = ?after, + "seeding_manager: subscriber added" + ); + Ok(()) + } } } - Ok(()) } pub fn subscribers_of( @@ -132,8 +176,15 @@ impl SeedingManager { } pub fn prune_subscriber(&self, loc: Location) { - self.subscribers.alter_all(|_, mut subs| { + self.subscribers.alter_all(|contract_key, mut subs| { if let Some(pos) = subs.iter().position(|l| l.location == Some(loc)) { + let removed = subs[pos].clone(); + tracing::debug!( + %contract_key, + removed_peer = %removed.peer, + removed_location = ?removed.location, + "seeding_manager: pruning subscriber due to location match" + ); subs.swap_remove(pos); } subs diff --git a/crates/core/src/router/mod.rs b/crates/core/src/router/mod.rs index ba459df22..39a5ea8c1 100644 --- a/crates/core/src/router/mod.rs +++ b/crates/core/src/router/mod.rs @@ -1,7 +1,7 @@ mod isotonic_estimator; mod util; -use crate::ring::{Location, PeerKeyLocation}; +use crate::ring::{Distance, Location, PeerKeyLocation}; use isotonic_estimator::{EstimatorType, IsotonicEstimator, IsotonicEvent}; use serde::{Deserialize, Serialize}; use std::time::Duration; @@ -162,9 +162,12 @@ impl Router { let mut peer_distances: Vec<_> = peers .into_iter() - .filter_map(|peer| { - peer.location - .map(|loc| (peer, target_location.distance(loc))) + .map(|peer| { + let distance = peer + .location + .map(|loc| target_location.distance(loc)) + .unwrap_or_else(|| Distance::new(0.5)); + (peer, distance) }) .collect(); @@ -202,9 +205,12 @@ impl Router { let mut peer_distances: Vec<_> = peers .into_iter() - .filter_map(|peer| { - peer.location - .map(|loc| (peer, target_location.distance(loc))) + .map(|peer| { + let distance = peer + .location + .map(|loc| target_location.distance(loc)) + .unwrap_or_else(|| Distance::new(0.5)); + (peer, distance) }) .collect(); diff --git a/crates/core/src/transport/connection_handler.rs b/crates/core/src/transport/connection_handler.rs index 5c1d5045c..231f502d4 100644 --- a/crates/core/src/transport/connection_handler.rs +++ b/crates/core/src/transport/connection_handler.rs @@ -1,9 +1,10 @@ use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::net::{IpAddr, SocketAddr}; use std::pin::Pin; use std::sync::atomic::AtomicU32; use std::sync::Arc; +use std::sync::Mutex; use std::time::{Duration, Instant}; use crate::config::PCK_VERSION; @@ -120,12 +121,16 @@ impl InboundConnectionHandler { #[derive(Clone)] pub(crate) struct OutboundConnectionHandler { send_queue: mpsc::Sender<(SocketAddr, ConnectionEvent)>, + expected_non_gateway: Arc>>, } #[cfg(test)] impl OutboundConnectionHandler { pub fn new(send_queue: mpsc::Sender<(SocketAddr, ConnectionEvent)>) -> Self { - OutboundConnectionHandler { send_queue } + OutboundConnectionHandler { + send_queue, + expected_non_gateway: Arc::new(Mutex::new(HashSet::new())), + } } } @@ -143,6 +148,8 @@ impl OutboundConnectionHandler { // Channel buffer is one so senders will await until the receiver is ready, important for bandwidth limiting let (outbound_sender, outbound_recv) = mpsc::channel(100); + let expected_non_gateway = Arc::new(Mutex::new(HashSet::new())); + let transport = UdpPacketsListener { is_gateway, socket_listener: socket.clone(), @@ -155,6 +162,7 @@ impl OutboundConnectionHandler { dropped_packets: HashMap::new(), last_drop_warning: Instant::now(), bandwidth_limit, + expected_non_gateway: expected_non_gateway.clone(), }; let bw_tracker = super::rate_limiter::PacketRateLimiter::new( DEFAULT_BW_TRACKER_WINDOW_SIZE, @@ -162,6 +170,7 @@ impl OutboundConnectionHandler { ); let connection_handler = OutboundConnectionHandler { send_queue: conn_handler_sender, + expected_non_gateway, }; // IMPORTANT: The general packet rate limiter is disabled (passing None) due to reliability issues. @@ -197,6 +206,9 @@ impl OutboundConnectionHandler { remote_public_key: TransportPublicKey, remote_addr: SocketAddr, ) -> Pin> + Send>> { + if let Ok(mut guard) = self.expected_non_gateway.lock() { + guard.insert(remote_addr.ip()); + } let (open_connection, recv_connection) = oneshot::channel(); if self .send_queue @@ -237,6 +249,7 @@ struct UdpPacketsListener { dropped_packets: HashMap, last_drop_warning: Instant, bandwidth_limit: Option, + expected_non_gateway: Arc>>, } type OngoingConnection = ( @@ -403,12 +416,19 @@ impl UdpPacketsListener { } if !self.is_gateway { - tracing::debug!( - %remote_addr, - %size, - "unexpected packet from non-gateway node" - ); - continue; + let allow = self + .expected_non_gateway + .lock() + .map(|set| set.contains(&remote_addr.ip())) + .unwrap_or(false); + if !allow { + tracing::debug!( + %remote_addr, + %size, + "unexpected packet from non-gateway node" + ); + continue; + } } // Check if we already have a gateway connection in progress @@ -477,6 +497,9 @@ impl UdpPacketsListener { match res.expect("task shouldn't panic") { Ok((outbound_remote_conn, inbound_remote_connection)) => { if let Some((_, result_sender)) = ongoing_connections.remove(&outbound_remote_conn.remote_addr) { + if let Ok(mut set) = self.expected_non_gateway.lock() { + set.remove(&outbound_remote_conn.remote_addr.ip()); + } tracing::debug!(remote_addr = %outbound_remote_conn.remote_addr, "connection established"); self.remote_connections.insert(outbound_remote_conn.remote_addr, inbound_remote_connection); let _ = result_sender.send(Ok(outbound_remote_conn)).map_err(|_| { @@ -498,6 +521,9 @@ impl UdpPacketsListener { } } if let Some((_, result_sender)) = ongoing_connections.remove(&remote_addr) { + if let Ok(mut set) = self.expected_non_gateway.lock() { + set.remove(&remote_addr.ip()); + } let _ = result_sender.send(Err(error)); } } @@ -541,8 +567,12 @@ impl UdpPacketsListener { } tracing::info!(%remote_addr, "attempting to establish connection"); let (ongoing_connection, packets_sender) = self.traverse_nat( - remote_addr, remote_public_key, + remote_addr, + remote_public_key.clone(), ); + if let Ok(mut set) = self.expected_non_gateway.lock() { + set.insert(remote_addr.ip()); + } let task = tokio::spawn(ongoing_connection .map_err(move |err| (err, remote_addr)) .instrument(span!(tracing::Level::DEBUG, "traverse_nat")) @@ -738,6 +768,7 @@ impl UdpPacketsListener { mpsc::channel::>(100); let this_addr = self.this_addr; let f = async move { + tracing::info!(%remote_addr, "Starting outbound handshake (NAT traversal)"); let mut state = ConnectionState::StartOutbound {}; // Initialize timeout and interval let mut timeout = INITIAL_TIMEOUT; @@ -840,6 +871,8 @@ impl UdpPacketsListener { .map_err(|_| TransportError::ChannelClosed)?; let (inbound_sender, inbound_recv) = mpsc::channel(100); tracing::debug!(%remote_addr, "connection established"); + let attempts = failures + 1; + tracing::info!(%remote_addr, attempts = attempts, "Outbound handshake completed (ack path)"); return Ok(( RemoteConnection { outbound_packets: outbound_packets.clone(), @@ -908,6 +941,8 @@ impl UdpPacketsListener { } // if is not an intro packet, the connection is successful and we can proceed let (inbound_sender, inbound_recv) = mpsc::channel(100); + let attempts = failures + 1; + tracing::info!(%remote_addr, attempts = attempts, "Outbound handshake completed (inbound ack path)"); return Ok(( RemoteConnection { outbound_packets: outbound_packets.clone(), @@ -970,6 +1005,7 @@ impl UdpPacketsListener { tick.tick().await; } + tracing::warn!(%remote_addr, attempts = failures, "Outbound handshake failed: max connection attempts reached"); Err(TransportError::ConnectionEstablishmentFailure { cause: "max connection attempts reached".into(), }) diff --git a/scripts/deploy-local-gateway.sh b/scripts/deploy-local-gateway.sh index a731dcdd9..3da4c8b30 100755 --- a/scripts/deploy-local-gateway.sh +++ b/scripts/deploy-local-gateway.sh @@ -249,7 +249,8 @@ start_service() { case "$SERVICE_MANAGER" in systemd) - if systemctl list-unit-files | grep -q "^$service_arg.service" 2>/dev/null; then + # Check if unit file exists by querying systemctl directly + if systemctl list-unit-files "$service_arg.service" 2>/dev/null | grep -q "$service_arg.service"; then echo -n " Starting systemd service ($service_arg)... " if [[ "$DRY_RUN" == "true" ]]; then echo "[DRY RUN]" @@ -294,7 +295,8 @@ verify_service() { case "$SERVICE_MANAGER" in systemd) - if systemctl list-unit-files | grep -q "^$service_arg.service" 2>/dev/null; then + # Check if unit file exists by querying systemctl directly + if systemctl list-unit-files "$service_arg.service" 2>/dev/null | grep -q "$service_arg.service"; then echo -n " Verifying service status ($service_arg)... " sleep 2 # Give service time to start if systemctl is-active --quiet "$service_arg.service"; then From b79a43e486263a59c62fb688f44b3a87efc5c2dc Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Sun, 2 Nov 2025 22:50:50 +0100 Subject: [PATCH 03/26] build(deps): bump freenet-stdlib to 0.1.24 --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 89be38703..f8cd93d59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1881,9 +1881,9 @@ dependencies = [ [[package]] name = "freenet-stdlib" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66c64fa03f4a083918c7e347be47122c223d8156f4c012a0fe8e89a643350f2d" +checksum = "f39e2953b4b0d82dd02458653b57166ba8c967c6b3fcec146102a27e05a7081a" dependencies = [ "arbitrary", "bincode", diff --git a/Cargo.toml b/Cargo.toml index 53593615b..2306c4732 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,7 @@ wasmer = "6.1.0" wasmer-compiler-singlepass = "6.1.0" # freenet-stdlib = { git = "https://github.com/freenet/freenet-stdlib.git", branch = "main", package = "freenet-stdlib" } -freenet-stdlib = { version = "0.1.23" } +freenet-stdlib = { version = "0.1.24" } # [patch.crates-io] # freenet-stdlib = { git = "https://github.com/freenet/freenet-stdlib.git", branch = "main" } From 48033bd8070f995fdd7acba0d7bc87198f1199ff Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Sun, 2 Nov 2025 23:44:08 +0100 Subject: [PATCH 04/26] fix(update): align fallback with fire-and-forget semantics --- .../src/node/network_bridge/p2p_protoc.rs | 9 +- crates/core/src/operations/update.rs | 117 +++++++++--------- 2 files changed, 64 insertions(+), 62 deletions(-) diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index b0db33ff4..a57f23b0c 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -841,13 +841,14 @@ impl P2pConnManager { .client_waiting_transaction .iter() .position(|(waiting, _)| match waiting { - WaitingTransaction::Subscription { contract_key } => { - contract_key == key.id() - } + WaitingTransaction::Subscription { + contract_key, + } => contract_key == key.id(), _ => false, }) { - let (_, clients) = state.client_waiting_transaction.remove(pos); + let (_, clients) = + state.client_waiting_transaction.remove(pos); tracing::debug!( "LocalSubscribeComplete for {} matched {} subscription waiters via contract {}", tx, diff --git a/crates/core/src/operations/update.rs b/crates/core/src/operations/update.rs index aa51b2244..b6ba487a4 100644 --- a/crates/core/src/operations/update.rs +++ b/crates/core/src/operations/update.rs @@ -249,8 +249,8 @@ impl Operation for UpdateOp { return_msg = None; } else { // Get broadcast targets for propagating UPDATE to subscribers - let mut broadcast_to = - op_manager.get_broadcast_targets_update(key, &request_sender.peer); + let mut broadcast_to = op_manager + .get_broadcast_targets_update(key, &request_sender.peer); if broadcast_to.is_empty() { broadcast_to = op_manager @@ -264,17 +264,16 @@ impl Operation for UpdateOp { "No broadcast targets, completing UPDATE locally" ); - let raw_state = State::from(updated_value.clone()); - let summary = StateSummary::from(raw_state.into_bytes()); + if upstream.is_none() { + new_state = Some(UpdateState::Finished { + key: *key, + summary: summary.clone(), + }); + } else { + new_state = None; + } - return_msg = Some(UpdateMsg::SuccessfulUpdate { - id: *id, - target: request_sender.clone(), - summary: summary.clone(), - key: *key, - sender: self_location.clone(), - }); - new_state = Some(UpdateState::Finished { key: *key, summary }); + return_msg = None; } else { // Broadcast to other peers match try_to_broadcast( @@ -439,18 +438,10 @@ impl Operation for UpdateOp { tracing::debug!( tx = %id, %key, - "No broadcast targets for SeekNode - completing with SuccessfulUpdate" + "No broadcast targets for SeekNode - completing locally" ); - let raw_state = State::from(updated_value.clone()); - let summary = StateSummary::from(raw_state.into_bytes()); - return_msg = Some(UpdateMsg::SuccessfulUpdate { - id: *id, - target: sender.clone(), - summary, - key: *key, - sender: op_manager.ring.connection_manager.own_location(), - }); new_state = None; + return_msg = None; } else { // Have peers to broadcast to - use try_to_broadcast match try_to_broadcast( @@ -528,7 +519,11 @@ impl Operation for UpdateOp { %key, "Successfully fetched contract locally, applying UPDATE" ); - let updated_value = update_contract( + let UpdateExecution { + value: updated_value, + summary: _summary, + changed, + } = update_contract( op_manager, *key, value.clone(), @@ -536,44 +531,50 @@ impl Operation for UpdateOp { ) .await?; - let mut broadcast_to = - op_manager.get_broadcast_targets_update(key, &sender.peer); - - if broadcast_to.is_empty() { - broadcast_to = op_manager - .compute_update_fallback_targets(key, &sender.peer); - } - - if broadcast_to.is_empty() { - let raw_state = State::from(updated_value); - let summary = StateSummary::from(raw_state.into_bytes()); - - return_msg = Some(UpdateMsg::SuccessfulUpdate { - id: *id, - target: sender.clone(), - summary, - key: *key, - sender: op_manager.ring.connection_manager.own_location(), - }); + if !changed { + tracing::debug!( + tx = %id, + %key, + "Fetched contract apply produced no change during SeekNode fallback" + ); new_state = None; + return_msg = None; } else { - match try_to_broadcast( - *id, - true, - op_manager, - self.state, - (broadcast_to, sender.clone()), - *key, - value.clone(), - false, - ) - .await - { - Ok((state, msg)) => { - new_state = state; - return_msg = msg; + let mut broadcast_to = + op_manager.get_broadcast_targets_update(key, &sender.peer); + + if broadcast_to.is_empty() { + broadcast_to = op_manager + .compute_update_fallback_targets(key, &sender.peer); + } + + if broadcast_to.is_empty() { + tracing::debug!( + tx = %id, + %key, + "No broadcast targets after SeekNode fallback apply; finishing locally" + ); + new_state = None; + return_msg = None; + } else { + match try_to_broadcast( + *id, + true, + op_manager, + self.state, + (broadcast_to, sender.clone()), + *key, + updated_value.clone(), + false, + ) + .await + { + Ok((state, msg)) => { + new_state = state; + return_msg = msg; + } + Err(err) => return Err(err), } - Err(err) => return Err(err), } } } else { From e3c28c15f226ebd90daca3794cfe4ef2aac8cbf4 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Mon, 3 Nov 2025 00:01:01 +0100 Subject: [PATCH 05/26] test: reserve unique ports for integration harness --- Cargo.lock | 1 + crates/core/Cargo.toml | 1 + crates/core/src/test_utils.rs | 48 ++++++++++++++++++++++++++++ crates/freenet-macros/src/codegen.rs | 18 +++-------- 4 files changed, 54 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f8cd93d59..7789079ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1770,6 +1770,7 @@ dependencies = [ "httptest", "itertools 0.14.0", "notify", + "once_cell", "opentelemetry 0.31.0", "opentelemetry-jaeger", "opentelemetry-otlp", diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 081abf00f..eb535d22d 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -44,6 +44,7 @@ pav_regression = "0.6.1" parking_lot = "0.12" pin-project = "1" rand = { features = ["small_rng"], workspace = true } +once_cell = "1" redb = { optional = true, version = "3" } serde = { features = ["derive", "rc"], workspace = true } serde_json = { workspace = true } diff --git a/crates/core/src/test_utils.rs b/crates/core/src/test_utils.rs index d2c7b406b..57cf57a00 100644 --- a/crates/core/src/test_utils.rs +++ b/crates/core/src/test_utils.rs @@ -8,10 +8,12 @@ use std::{ }; use clap::ValueEnum; +use dashmap::DashSet; use freenet_stdlib::{ client_api::{ClientRequest, ContractRequest, WebApi}, prelude::*, }; +use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; use crate::util::workspace::get_workspace_target_dir; @@ -810,6 +812,41 @@ mod test { } } +// Port reservation utilities for integration tests +static RESERVED_PORTS: Lazy> = Lazy::new(DashSet::new); + +/// Reserve a unique localhost TCP port for tests. +/// +/// Ports are allocated by binding to an ephemeral listener to ensure the port +/// is currently free, then tracked in a global set so concurrent tests do not +/// reuse the same value. Ports remain reserved until released via +/// [`release_local_port`]. +pub fn reserve_local_port() -> anyhow::Result { + const MAX_ATTEMPTS: usize = 128; + for _ in 0..MAX_ATTEMPTS { + let listener = std::net::TcpListener::bind(("127.0.0.1", 0)) + .map_err(|e| anyhow::anyhow!("failed to bind ephemeral port: {e}"))?; + let port = listener + .local_addr() + .map_err(|e| anyhow::anyhow!("failed to read ephemeral port address: {e}"))? + .port(); + drop(listener); + + if RESERVED_PORTS.insert(port) { + return Ok(port); + } + } + + Err(anyhow::anyhow!( + "failed to reserve a unique local port after {MAX_ATTEMPTS} attempts" + )) +} + +/// Release a previously reserved port so future tests may reuse it. +pub fn release_local_port(port: u16) { + RESERVED_PORTS.remove(&port); +} + // Test context for integration tests use std::collections::HashMap; @@ -1318,6 +1355,17 @@ impl TestContext { } } +impl Drop for TestContext { + fn drop(&mut self) { + for node in self.nodes.values() { + release_local_port(node.ws_port); + if let Some(port) = node.network_port { + release_local_port(port); + } + } + } +} + // Event aggregator test utilities pub mod event_aggregator_utils { //! Test utilities for event log aggregation. diff --git a/crates/freenet-macros/src/codegen.rs b/crates/freenet-macros/src/codegen.rs index 0344e1d75..e2b5f12d9 100644 --- a/crates/freenet-macros/src/codegen.rs +++ b/crates/freenet-macros/src/codegen.rs @@ -124,13 +124,8 @@ fn generate_node_setup(args: &FreenetTestArgs) -> TokenStream { key.save(&transport_keypair)?; key.public().save(temp_dir.path().join("public.pem"))?; - let network_socket = std::net::TcpListener::bind("127.0.0.1:0")?; - let ws_socket = std::net::TcpListener::bind("127.0.0.1:0")?; - let network_port = network_socket.local_addr()?.port(); - let ws_port = ws_socket.local_addr()?.port(); - - std::mem::drop(network_socket); - std::mem::drop(ws_socket); + let network_port = freenet::test_utils::reserve_local_port()?; + let ws_port = freenet::test_utils::reserve_local_port()?; let location: f64 = rand::Rng::random(&mut rand::rng()); @@ -239,13 +234,8 @@ fn generate_node_setup(args: &FreenetTestArgs) -> TokenStream { key.save(&transport_keypair)?; key.public().save(temp_dir.path().join("public.pem"))?; - let network_socket = std::net::TcpListener::bind("127.0.0.1:0")?; - let network_port = network_socket.local_addr()?.port(); - std::mem::drop(network_socket); - - let ws_socket = std::net::TcpListener::bind("127.0.0.1:0")?; - let ws_port = ws_socket.local_addr()?.port(); - std::mem::drop(ws_socket); + let network_port = freenet::test_utils::reserve_local_port()?; + let ws_port = freenet::test_utils::reserve_local_port()?; let location: f64 = rand::Rng::random(&mut rand::rng()); From d302f3614605a05cd0c3da4bd3a946be9fd3f571 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Mon, 3 Nov 2025 00:13:21 +0100 Subject: [PATCH 06/26] test: harden packet corruption detection --- crates/core/src/transport/packet_data.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/core/src/transport/packet_data.rs b/crates/core/src/transport/packet_data.rs index 8a8cc368b..2128a8a4f 100644 --- a/crates/core/src/transport/packet_data.rs +++ b/crates/core/src/transport/packet_data.rs @@ -305,8 +305,9 @@ mod tests { let unencrypted_packet = PacketData::<_, 1000>::from_buf_plain(data); let mut encrypted_packet = unencrypted_packet.encrypt_symmetric(&cipher); - // Corrupt the packet data - encrypted_packet.data[encrypted_packet.size / 2] = 0; + // Corrupt the packet data by flipping bits at a deterministic position. + let mid = encrypted_packet.size / 2; + encrypted_packet.data[mid] ^= 0xFF; // Ensure decryption fails match encrypted_packet.decrypt(&cipher) { From f12ecf596c476d48ed39af7d830f4ca98e974143 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Mon, 3 Nov 2025 00:39:24 +0100 Subject: [PATCH 07/26] test: retry flaky three-hop PUT path --- crates/core/tests/operations.rs | 133 +++++++++++++++++++++----------- 1 file changed, 87 insertions(+), 46 deletions(-) diff --git a/crates/core/tests/operations.rs b/crates/core/tests/operations.rs index 150c5cc11..06cac201b 100644 --- a/crates/core/tests/operations.rs +++ b/crates/core/tests/operations.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, bail}; +use anyhow::{anyhow, bail, ensure}; use freenet::{ config::{ConfigArgs, InlineGwConfig, NetworkArgs, SecretArgs, WebsocketApiArgs}, dev_tool::TransportKeypair, @@ -128,6 +128,86 @@ async fn get_contract( } } +async fn send_put_with_retry( + client: &mut WebApi, + state: WrappedState, + contract: ContractContainer, + description: &str, + expected_key: Option, +) -> anyhow::Result<()> { + const MAX_ATTEMPTS: usize = 3; + for attempt in 1..=MAX_ATTEMPTS { + tracing::info!("Sending {} (attempt {attempt}/{MAX_ATTEMPTS})", description); + + make_put(client, state.clone(), contract.clone(), false).await?; + + match tokio::time::timeout(Duration::from_secs(120), client.recv()).await { + Ok(Ok(HostResponse::ContractResponse(ContractResponse::PutResponse { key }))) => { + if let Some(expected) = expected_key { + ensure!( + key == expected, + "{} returned unexpected contract key (expected {}, got {})", + description, + expected, + key + ); + } + tracing::info!("{description} succeeded on attempt {attempt}"); + return Ok(()); + } + Ok(Ok(other)) => { + tracing::warn!( + "{} attempt {attempt} returned unexpected response: {:?}", + description, + other + ); + } + Ok(Err(e)) => { + tracing::warn!( + "{} attempt {attempt} failed while receiving response: {}", + description, + e + ); + } + Err(_) => { + tracing::warn!( + "{} attempt {attempt} timed out waiting for response", + description + ); + } + } + + if attempt == MAX_ATTEMPTS { + bail!("{description} failed after {MAX_ATTEMPTS} attempts"); + } + + // Drain any stray responses/errors before retrying to keep the client state clean. + loop { + match tokio::time::timeout(Duration::from_millis(200), client.recv()).await { + Ok(Ok(resp)) => { + tracing::warn!( + "Discarding stray response prior to retrying {}: {:?}", + description, + resp + ); + } + Ok(Err(err)) => { + tracing::warn!( + "Discarding stray error prior to retrying {}: {}", + description, + err + ); + } + Err(_) => break, + } + } + + tokio::time::sleep(Duration::from_secs(3)).await; + } + + unreachable!("send_put_with_retry loop should always return or bail"); +} + /// Test PUT operation across two peers (gateway and peer) #[freenet_test( nodes = ["gateway", "peer-a"], @@ -443,34 +523,15 @@ async fn test_put_merge_persists_state(ctx: &mut TestContext) -> TestResult { let (stream, _) = connect_async(&uri).await?; let mut client_api_a = WebApi::start(stream); - // First PUT: Store initial contract state - tracing::info!("Sending first PUT with initial state..."); - make_put( + send_put_with_retry( &mut client_api_a, initial_wrapped_state.clone(), contract.clone(), - false, + "first PUT (cache seed)", + Some(contract_key), ) .await?; - // Wait for first put response - let resp = tokio::time::timeout(Duration::from_secs(120), client_api_a.recv()).await; - match resp { - Ok(Ok(HostResponse::ContractResponse(ContractResponse::PutResponse { key }))) => { - tracing::info!("First PUT successful for contract: {}", key); - assert_eq!(key, contract_key); - } - Ok(Ok(other)) => { - bail!("Unexpected response for first PUT: {:?}", other); - } - Ok(Err(e)) => { - bail!("Error receiving first PUT response: {}", e); - } - Err(_) => { - bail!("Timeout waiting for first PUT response"); - } - } - // Wait a bit to ensure state is fully cached tokio::time::sleep(Duration::from_secs(2)).await; @@ -498,35 +559,15 @@ async fn test_put_merge_persists_state(ctx: &mut TestContext) -> TestResult { updated_wrapped_state.as_ref().len() ); - // Second PUT: Update the already-cached contract with new state - // This tests the bug fix - the merged state should be persisted - tracing::info!("Sending second PUT with updated state..."); - make_put( + send_put_with_retry( &mut client_api_a, updated_wrapped_state.clone(), contract.clone(), - false, + "second PUT (merge)", + Some(contract_key), ) .await?; - // Wait for second put response - let resp = tokio::time::timeout(Duration::from_secs(120), client_api_a.recv()).await; - match resp { - Ok(Ok(HostResponse::ContractResponse(ContractResponse::PutResponse { key }))) => { - tracing::info!("Second PUT successful for contract: {}", key); - assert_eq!(key, contract_key); - } - Ok(Ok(other)) => { - bail!("Unexpected response for second PUT: {:?}", other); - } - Ok(Err(e)) => { - bail!("Error receiving second PUT response: {}", e); - } - Err(_) => { - bail!("Timeout waiting for second PUT response"); - } - } - // Wait a bit to ensure the merge and persistence completes tokio::time::sleep(Duration::from_secs(2)).await; From 349cd08c201275e4928069a1a948bb2e01452eea Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Mon, 3 Nov 2025 01:06:03 +0100 Subject: [PATCH 08/26] fix: keep peers running after handshake drop --- .../src/node/network_bridge/p2p_protoc.rs | 112 ++++++++++++++++-- 1 file changed, 104 insertions(+), 8 deletions(-) diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index a57f23b0c..c9b981547 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -465,8 +465,7 @@ impl P2pConnManager { } ConnEvent::ClosedChannel(reason) => { match reason { - ChannelCloseReason::Handshake - | ChannelCloseReason::Bridge + ChannelCloseReason::Bridge | ChannelCloseReason::Controller | ChannelCloseReason::Notification | ChannelCloseReason::OpExecution => { @@ -942,10 +941,13 @@ impl P2pConnManager { Ok(EventResult::Continue) } Err(handshake_error) => { - tracing::error!(?handshake_error, "Handshake handler error"); - Ok(EventResult::Event( - ConnEvent::ClosedChannel(ChannelCloseReason::Handshake).into(), - )) + tracing::warn!( + ?handshake_error, + "Handshake handler yielded error; cleaning up pending connections" + ); + self.handle_handshake_failure(handshake_error, state) + .await?; + Ok(EventResult::Continue) } } } @@ -1462,6 +1464,102 @@ impl P2pConnManager { Ok(()) } + async fn handle_handshake_failure( + &mut self, + handshake_error: HandshakeError, + state: &mut EventListenerState, + ) -> anyhow::Result<()> { + match handshake_error { + HandshakeError::ConnectionClosed(addr) => { + let pending_txs = state + .awaiting_connection_txs + .remove(&addr) + .unwrap_or_default(); + if let Some(callbacks) = state.awaiting_connection.remove(&addr) { + tracing::info!( + remote = %addr, + callbacks = callbacks.len(), + pending_txs = ?pending_txs, + "Notifying callbacks after handshake connection closed" + ); + + let mut callbacks = callbacks.into_iter(); + if let Some(mut cb) = callbacks.next() { + cb.send_result(Err(HandshakeError::ConnectionClosed(addr))) + .await + .inspect_err(|err| { + tracing::debug!( + remote = %addr, + error = ?err, + "Failed to notify primary handshake callback" + ); + }) + .ok(); + } + + for mut cb in callbacks { + cb.send_result(Err(HandshakeError::ChannelClosed)) + .await + .inspect_err(|err| { + tracing::debug!( + remote = %addr, + error = ?err, + "Failed to notify fallback handshake callback" + ); + }) + .ok(); + } + } + + // Drop any pending transient transactions bound to this address + state + .transient_conn + .retain(|_, socket_addr| socket_addr != &addr); + } + HandshakeError::ChannelClosed => { + if !state.awaiting_connection.is_empty() { + tracing::warn!( + awaiting = state.awaiting_connection.len(), + "Handshake channel closed; notifying all pending callbacks" + ); + } + + let awaiting = std::mem::take(&mut state.awaiting_connection); + let awaiting_txs = std::mem::take(&mut state.awaiting_connection_txs); + + for (addr, callbacks) in awaiting { + let pending_txs = awaiting_txs.get(&addr).cloned().unwrap_or_default(); + tracing::debug!( + remote = %addr, + callbacks = callbacks.len(), + pending_txs = ?pending_txs, + "Delivering channel-closed notification to pending callbacks" + ); + for mut cb in callbacks { + cb.send_result(Err(HandshakeError::ChannelClosed)) + .await + .inspect_err(|err| { + tracing::debug!( + remote = %addr, + error = ?err, + "Failed to deliver channel-closed handshake notification" + ); + }) + .ok(); + } + } + } + other => { + tracing::warn!( + ?other, + "Unhandled handshake error without socket association" + ); + } + } + + Ok(()) + } + async fn try_to_forward(&mut self, forward_to: &PeerId, msg: NetMessage) -> anyhow::Result<()> { if let Some(peer) = self.connections.get(forward_to) { tracing::debug!(%forward_to, %msg, "Forwarding message to peer"); @@ -1851,8 +1949,6 @@ pub(super) enum ConnEvent { #[derive(Debug)] pub(super) enum ChannelCloseReason { - /// Handshake channel closed - potentially transient, continue operation - Handshake, /// Internal bridge channel closed - critical, must shutdown gracefully Bridge, /// Node controller channel closed - critical, must shutdown gracefully From 0bf2c1441c62d82a1a1a26613a1a9359f2f86fcc Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Mon, 3 Nov 2025 01:16:21 +0100 Subject: [PATCH 09/26] test: default harness logging to pretty format --- crates/freenet-macros/src/codegen.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/freenet-macros/src/codegen.rs b/crates/freenet-macros/src/codegen.rs index e2b5f12d9..7f470f566 100644 --- a/crates/freenet-macros/src/codegen.rs +++ b/crates/freenet-macros/src/codegen.rs @@ -62,10 +62,11 @@ pub fn generate_test_code(args: FreenetTestArgs, input_fn: ItemFn) -> Result Date: Mon, 3 Nov 2025 23:37:49 +0100 Subject: [PATCH 10/26] refactor(connect): checkpoint handshake adjustments Tested: cargo test --test message_flow river_message_flow_over_freenet -- --ignored --exact --nocapture Tested: cargo test --test message_flow river_message_flow_over_freenet_six_peers_five_rounds -- --ignored --exact --nocapture --- crates/core/src/client_events/mod.rs | 4 +- crates/core/src/message.rs | 7 + .../src/node/network_bridge/p2p_protoc.rs | 33 ++-- crates/core/src/node/testing_impl.rs | 4 + crates/core/src/operations/connect.rs | 161 ++++++++++++------ crates/core/src/ring/connection_manager.rs | 30 ++-- crates/core/src/ring/mod.rs | 32 ++-- crates/core/src/router/mod.rs | 11 +- .../core/src/transport/connection_handler.rs | 151 ++++++++-------- crates/core/src/transport/peer_connection.rs | 2 +- crates/freenet-macros/src/codegen.rs | 6 +- 11 files changed, 260 insertions(+), 181 deletions(-) diff --git a/crates/core/src/client_events/mod.rs b/crates/core/src/client_events/mod.rs index 8fd6d42c4..894bd1614 100644 --- a/crates/core/src/client_events/mod.rs +++ b/crates/core/src/client_events/mod.rs @@ -1155,10 +1155,10 @@ async fn process_open_request( tracing::debug!( peer_id = %peer_id, key = %key, - "Starting direct SUBSCRIBE operation (legacy mode)", + "Starting direct SUBSCRIBE operation", ); - // Legacy mode: generate transaction, register first, then run op + // Generate transaction, register first, then run op let tx = crate::message::Transaction::new::< crate::operations::subscribe::SubscribeMsg, >(); diff --git a/crates/core/src/message.rs b/crates/core/src/message.rs index 8312bd735..4080bff20 100644 --- a/crates/core/src/message.rs +++ b/crates/core/src/message.rs @@ -363,6 +363,10 @@ pub(crate) enum NodeEvent { key: ContractKey, subscribed: bool, }, + /// Register expectation for an inbound connection from the given peer. + ExpectPeerConnection { + peer: PeerId, + }, /// Send a message to a peer over the network SendMessage { target: PeerId, @@ -444,6 +448,9 @@ impl Display for NodeEvent { "Local subscribe complete (tx: {tx}, key: {key}, subscribed: {subscribed})" ) } + NodeEvent::ExpectPeerConnection { peer } => { + write!(f, "ExpectPeerConnection (from {peer})") + } NodeEvent::SendMessage { target, msg } => { write!(f, "SendMessage (to {target}, tx: {})", msg.id()) } diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index c9b981547..e9b895be0 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -32,7 +32,8 @@ use crate::node::{MessageProcessor, PeerId}; use crate::operations::{connect::ConnectMsg, get::GetMsg, put::PutMsg, update::UpdateMsg}; use crate::ring::Location; use crate::transport::{ - create_connection_handler, PeerConnection, TransportError, TransportKeypair, + create_connection_handler, OutboundConnectionHandler, PeerConnection, TransportError, + TransportKeypair, }; use crate::{ client_events::ClientId, @@ -193,6 +194,16 @@ impl P2pConnManager { message_processor, } = self; + let (outbound_conn_handler, inbound_conn_handler) = create_connection_handler::( + key_pair.clone(), + listening_ip, + listening_port, + is_gateway, + bandwidth_limit, + if is_gateway { &[] } else { &gateways }, + ) + .await?; + tracing::info!( %listening_port, %listening_ip, @@ -201,22 +212,13 @@ impl P2pConnManager { "Opening network listener - will receive from channel" ); - let mut state = EventListenerState::new(); + let mut state = EventListenerState::new(outbound_conn_handler.clone()); // Separate peer_connections to allow independent borrowing by the stream let peer_connections: FuturesUnordered< BoxFuture<'static, Result>, > = FuturesUnordered::new(); - let (outbound_conn_handler, inbound_conn_handler) = create_connection_handler::( - key_pair.clone(), - listening_ip, - listening_port, - is_gateway, - bandwidth_limit, - ) - .await?; - // For non-gateway peers, pass the peer_ready flag so it can be set after first handshake // For gateways, pass None (they're always ready) let peer_ready = if !is_gateway { @@ -566,6 +568,10 @@ impl P2pConnManager { ) .await?; } + NodeEvent::ExpectPeerConnection { peer } => { + tracing::debug!(%peer, "ExpectPeerConnection event received; registering inbound expectation"); + state.outbound_handler.expect_incoming(peer.addr); + } NodeEvent::SendMessage { target, msg } => { // Send the message to the target peer over the network tracing::debug!( @@ -1113,6 +1119,7 @@ impl P2pConnManager { pending_txs = ?txs_entry, "connect_peer: registered new pending connection" ); + state.outbound_handler.expect_incoming(peer_addr); } } tracing::debug!( @@ -1909,6 +1916,7 @@ impl ConnectResultSender for mpsc::Sender), ()>> { } struct EventListenerState { + outbound_handler: OutboundConnectionHandler, // Note: peer_connections has been moved out to allow separate borrowing by the stream pending_from_executor: HashSet, // FIXME: we are potentially leaving trash here when transacrions are completed @@ -1921,8 +1929,9 @@ struct EventListenerState { } impl EventListenerState { - fn new() -> Self { + fn new(outbound_handler: OutboundConnectionHandler) -> Self { Self { + outbound_handler, pending_from_executor: HashSet::new(), tx_to_client: HashMap::new(), client_waiting_transaction: Vec::new(), diff --git a/crates/core/src/node/testing_impl.rs b/crates/core/src/node/testing_impl.rs index cb3b30ce2..562285c7b 100644 --- a/crates/core/src/node/testing_impl.rs +++ b/crates/core/src/node/testing_impl.rs @@ -935,6 +935,10 @@ where NodeEvent::QueryNodeDiagnostics { .. } => { unimplemented!() } + NodeEvent::ExpectPeerConnection { peer } => { + tracing::debug!(%peer, "ExpectPeerConnection ignored in testing impl"); + continue; + } NodeEvent::SendMessage { target, msg } => { tracing::debug!(tx = %msg.id(), %target, "SendMessage event in testing_impl"); conn_manager.send(&target, *msg).await?; diff --git a/crates/core/src/operations/connect.rs b/crates/core/src/operations/connect.rs index 589c27afc..993be2a0d 100644 --- a/crates/core/src/operations/connect.rs +++ b/crates/core/src/operations/connect.rs @@ -338,12 +338,36 @@ impl Operation for ConnectOp { "Checking connectivity request received" ); - let should_accept = if op_manager + let requested_accept = op_manager .ring .connection_manager - .should_accept(joiner_loc, &joiner.peer) - { + .should_accept(joiner_loc, &joiner.peer); + let acceptance_status = if requested_accept { tracing::info!(tx = %id, %joiner, "CheckConnectivity: Accepting connection from, will trigger ConnectPeer"); + // Ensure the transport layer is ready for the incoming handshake before we notify upstream. + op_manager + .notify_node_event(NodeEvent::ExpectPeerConnection { + peer: joiner.peer.clone(), + }) + .await?; + if sender.peer != this_peer.peer { + let accept_msg = ConnectMsg::Response { + id: *id, + sender: this_peer.clone(), + target: sender.clone(), + msg: ConnectResponse::AcceptedBy { + accepted: true, + acceptor: this_peer.clone(), + joiner: joiner.peer.clone(), + }, + }; + op_manager + .notify_node_event(NodeEvent::SendMessage { + target: sender.peer.clone(), + msg: Box::new(NetMessage::from(accept_msg)), + }) + .await?; + } let (callback, mut result) = tokio::sync::mpsc::channel(10); // Attempt to connect to the joiner op_manager @@ -354,6 +378,7 @@ impl Operation for ConnectOp { is_gw: false, }) .await?; + let mut status = true; match result.recv().await.ok_or(OpError::NotificationError)? { Ok((peer_id, remaining_checks)) => { tracing::info!( @@ -363,15 +388,11 @@ impl Operation for ConnectOp { remaining_checks, "ConnectPeer completed successfully" ); - let was_reserved = { - // reserved just above in call to should_accept - true - }; + let was_reserved = true; // reserved just above in call to should_accept op_manager .ring .add_connection(joiner_loc, joiner.peer.clone(), was_reserved) .await; - true } Err(()) => { tracing::info!( @@ -383,11 +404,48 @@ impl Operation for ConnectOp { .ring .connection_manager .prune_in_transit_connection(&joiner.peer); - false + status = false; + if sender.peer != this_peer.peer { + let decline_msg = ConnectMsg::Response { + id: *id, + sender: this_peer.clone(), + target: sender.clone(), + msg: ConnectResponse::AcceptedBy { + accepted: false, + acceptor: this_peer.clone(), + joiner: joiner.peer.clone(), + }, + }; + op_manager + .notify_node_event(NodeEvent::SendMessage { + target: sender.peer.clone(), + msg: Box::new(NetMessage::from(decline_msg)), + }) + .await?; + } } } + status } else { tracing::debug!(tx = %id, at = %this_peer.peer, from = %joiner, "Rejecting connection"); + if sender.peer != this_peer.peer { + let decline_msg = ConnectMsg::Response { + id: *id, + sender: this_peer.clone(), + target: sender.clone(), + msg: ConnectResponse::AcceptedBy { + accepted: false, + acceptor: this_peer.clone(), + joiner: joiner.peer.clone(), + }, + }; + op_manager + .notify_node_event(NodeEvent::SendMessage { + target: sender.peer.clone(), + msg: Box::new(NetMessage::from(decline_msg)), + }) + .await?; + } false }; @@ -402,7 +460,7 @@ impl Operation for ConnectOp { ForwardParams { left_htl: hops_left, max_htl, - accepted: should_accept, + accepted: requested_accept, skip_connections: skip_connections.clone(), skip_forwards: skip_forwards.clone(), req_peer: sender.clone(), @@ -418,18 +476,17 @@ impl Operation for ConnectOp { } } - let response = ConnectResponse::AcceptedBy { - accepted: should_accept, - acceptor: this_peer.clone(), - joiner: joiner.peer.clone(), - }; - - return_msg = Some(ConnectMsg::Response { + let response_msg = ConnectMsg::Response { id: *id, sender: this_peer.clone(), - msg: response, target: sender.clone(), - }); + msg: ConnectResponse::AcceptedBy { + accepted: acceptance_status, + acceptor: this_peer.clone(), + joiner: joiner.peer.clone(), + }, + }; + return_msg = Some(response_msg); } ConnectMsg::Response { id, @@ -469,6 +526,14 @@ impl Operation for ConnectOp { connected_to = %acceptor.peer, "Open connection acknowledged at requesting joiner peer", ); + if acceptor.peer != this_peer_id { + // Ensure inbound handshake packets from the acceptor aren't dropped. + op_manager + .notify_node_event(NodeEvent::ExpectPeerConnection { + peer: acceptor.peer.clone(), + }) + .await?; + } tracing::info!( tx = %id, joiner = %this_peer_id, @@ -482,7 +547,7 @@ impl Operation for ConnectOp { .add_connection( acceptor.location.expect("location not found"), acceptor.peer.clone(), - true, // we reserved the connection to this peer before asking to join + true, ) .await; } else { @@ -590,6 +655,21 @@ impl Operation for ConnectOp { let remaining_connections = info.remaining_connections.saturating_sub(1); + if *accepted && *joiner == this_peer_id && acceptor.peer != this_peer_id + { + tracing::debug!( + tx = %id, + at = %this_peer_id, + acceptor = %acceptor.peer, + "Forward path accepted connection; registering inbound expectation" + ); + op_manager + .notify_node_event(NodeEvent::ExpectPeerConnection { + peer: acceptor.peer.clone(), + }) + .await?; + } + if remaining_connections == 0 { tracing::debug!( tx = %id, @@ -1118,8 +1198,6 @@ where let num_connections = connection_manager.num_connections(); let num_reserved = connection_manager.get_reserved_connections(); - let max_connections = connection_manager.max_connections; - tracing::info!( tx = %id, joiner = %joiner.peer, @@ -1127,35 +1205,13 @@ where num_reserved = %num_reserved, is_gateway = %is_gateway, accepted = %accepted, + skip_connections_count = %skip_connections.len(), + skip_forwards_count = %skip_forwards.len(), "forward_conn: checking connection forwarding", ); - // Special case: Gateway bootstrap when starting with zero connections AND only one reserved - // Note: num_reserved will be 1 (not 0) because should_accept() already reserved a slot - // for this connection. This ensures only the very first connection is accepted directly, - // avoiding race conditions where multiple concurrent join attempts would all be accepted directly. - // - // IMPORTANT: Bootstrap acceptances are marked with is_bootstrap_acceptance=true so that - // the handshake handler (see handshake.rs forward_or_accept_join) can immediately register - // the connection in the ring. This bypasses the normal CheckConnectivity flow which doesn't - // apply to bootstrap since: - // 1. There are no other peers to forward to - // 2. The "already connected" bug doesn't apply (this is the first connection) - // 3. We need the connection registered so the gateway can respond to FindOptimalPeer requests - // - // See PR #1871 discussion with @iduartgomez for context. - // - // IMPORTANT (issue #1908): Extended to cover early network formation (only the very first peer). - // During bootstrap we keep the first connection direct to guarantee bidirectional connectivity; - // subsequent peers should be forwarded through existing nodes. - // - // However, we still respect max_connections - this only applies when there's capacity. - const EARLY_NETWORK_THRESHOLD: usize = 4; - let has_capacity = num_connections + num_reserved < max_connections; - if is_gateway - && accepted - && (num_connections == 0 || (num_connections < EARLY_NETWORK_THRESHOLD && has_capacity)) - { + // Bootstrap: gateway has no neighbours yet, so we keep the courtesy link and stop here. + if is_gateway && accepted && num_connections == 0 { if num_reserved != 1 { tracing::debug!( tx = %id, @@ -1167,11 +1223,9 @@ where tracing::info!( tx = %id, joiner = %joiner.peer, - connections = num_connections, - has_capacity = %has_capacity, - "Gateway early network: accepting connection directly (will register immediately)", + "Gateway bootstrap: accepting first neighbour directly" ); - let connectivity_info = ConnectivityInfo::new_bootstrap(joiner.clone(), 1); // Single check for direct connection + let connectivity_info = ConnectivityInfo::new_bootstrap(joiner.clone(), 1); return Ok(Some(ConnectState::AwaitingConnectivity(connectivity_info))); } @@ -1231,7 +1285,8 @@ where target_peer ); network_bridge.send(&target_peer.peer, forward_msg).await?; - return update_state_with_forward_info(&req_peer, left_htl); + let forwarded_state = update_state_with_forward_info(&req_peer, left_htl)?; + return Ok(forwarded_state); } None => { // Couldn't find suitable peer to forward to diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index 2f326b978..bd9d6a7b9 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -160,6 +160,8 @@ impl ConnectionManager { open, reserved_before, is_gateway = self.is_gateway, + min = self.min_connections, + max = self.max_connections, "should_accept: evaluating direct acceptance guard" ); @@ -178,7 +180,7 @@ impl ConnectionManager { + open; if open == 0 { - // if this is the first connection, then accept it + tracing::debug!(%peer_id, "should_accept: first connection -> accepting"); return true; } @@ -195,23 +197,22 @@ impl ConnectionManager { ); self.reserved_connections .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); + tracing::info!(%peer_id, "should_accept: gateway direct-accept limit hit, forwarding instead"); return false; } } if self.location_for_peer.read().get(peer_id).is_some() { - // avoid connecting more than once to the same peer - self.reserved_connections - .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); - tracing::debug!(%peer_id, "Peer already connected"); - return false; + // We've already accepted this peer (pending or active); treat as a no-op acceptance. + tracing::debug!(%peer_id, "Peer already pending/connected; acknowledging acceptance"); + return true; } let accepted = if total_conn < self.min_connections { - tracing::info!(%peer_id, "Accepted connection, below min connections"); + tracing::info!(%peer_id, total_conn, "should_accept: accepted (below min connections)"); true } else if total_conn >= self.max_connections { - tracing::info!(%peer_id, "Rejected connection, max connections reached"); + tracing::info!(%peer_id, total_conn, "should_accept: rejected (max connections reached)"); false } else { let accepted = self @@ -220,18 +221,19 @@ impl ConnectionManager { .evaluate_new_connection(location, Instant::now()) .unwrap_or(true); - if accepted { - tracing::info!(%peer_id, "Accepted connection, topology manager"); - } else { - tracing::info!(%peer_id, "Rejected connection, topology manager"); - } + tracing::info!( + %peer_id, + total_conn, + accepted, + "should_accept: topology manager decision" + ); accepted }; if !accepted { self.reserved_connections .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); } else { - tracing::info!(%peer_id, "Accepted connection, reserving spot"); + tracing::info!(%peer_id, total_conn, "should_accept: accepted (reserving spot)"); self.record_pending_location(peer_id, location); } accepted diff --git a/crates/core/src/ring/mod.rs b/crates/core/src/ring/mod.rs index 6ee9330eb..ec07cab30 100644 --- a/crates/core/src/ring/mod.rs +++ b/crates/core/src/ring/mod.rs @@ -270,38 +270,38 @@ impl Ring { let router = self.router.read(); let target_location = Location::from(contract_key); - let mut candidates: BTreeMap = BTreeMap::new(); + let mut seen = HashSet::new(); + let mut candidates: Vec = Vec::new(); let connections = self.connection_manager.get_connections_by_location(); for conns in connections.values() { for conn in conns { let peer = conn.location.peer.clone(); - if skip_list.has_element(peer.clone()) { + if skip_list.has_element(peer.clone()) || !seen.insert(peer) { continue; } - candidates - .entry(peer) - .or_insert_with(|| conn.location.clone()); + candidates.push(conn.location.clone()); } } - let known_locations = self.connection_manager.get_known_locations(); - for (peer, location) in known_locations { - if skip_list.has_element(peer.clone()) { - continue; - } - candidates - .entry(peer.clone()) - .or_insert_with(|| PeerKeyLocation { + if candidates.len() < k { + let known_locations = self.connection_manager.get_known_locations(); + for (peer, location) in known_locations { + if skip_list.has_element(peer.clone()) || !seen.insert(peer.clone()) { + continue; + } + candidates.push(PeerKeyLocation { peer, location: Some(location), }); + if candidates.len() >= k { + break; + } + } } - let peers = candidates.values(); - router - .select_k_best_peers(peers, target_location, k) + .select_k_best_peers(candidates.iter(), target_location, k) .into_iter() .cloned() .collect() diff --git a/crates/core/src/router/mod.rs b/crates/core/src/router/mod.rs index 39a5ea8c1..f5749154b 100644 --- a/crates/core/src/router/mod.rs +++ b/crates/core/src/router/mod.rs @@ -205,12 +205,11 @@ impl Router { let mut peer_distances: Vec<_> = peers .into_iter() - .map(|peer| { - let distance = peer - .location - .map(|loc| target_location.distance(loc)) - .unwrap_or_else(|| Distance::new(0.5)); - (peer, distance) + .filter_map(|peer| { + peer.location.map(|loc| { + let distance = target_location.distance(loc); + (peer, distance) + }) }) .collect(); diff --git a/crates/core/src/transport/connection_handler.rs b/crates/core/src/transport/connection_handler.rs index 231f502d4..90899c148 100644 --- a/crates/core/src/transport/connection_handler.rs +++ b/crates/core/src/transport/connection_handler.rs @@ -4,14 +4,15 @@ use std::net::{IpAddr, SocketAddr}; use std::pin::Pin; use std::sync::atomic::AtomicU32; use std::sync::Arc; -use std::sync::Mutex; use std::time::{Duration, Instant}; use crate::config::PCK_VERSION; +use crate::ring::PeerKeyLocation; use crate::transport::crypto::TransportSecretKey; use crate::transport::packet_data::{AssymetricRSA, UnknownEncryption}; use crate::transport::symmetric_message::OutboundConnection; use aes_gcm::{Aes128Gcm, KeyInit}; +use dashmap::DashSet; use futures::{ future::BoxFuture, stream::{FuturesUnordered, StreamExt}, @@ -37,9 +38,7 @@ use super::{ }; // Constants for interval increase -const INITIAL_INTERVAL: Duration = Duration::from_millis(200); -const INTERVAL_INCREASE_FACTOR: u64 = 2; -const MAX_INTERVAL: Duration = Duration::from_millis(5000); // Maximum interval limit +const INITIAL_INTERVAL: Duration = Duration::from_millis(50); const DEFAULT_BW_TRACKER_WINDOW_SIZE: Duration = Duration::from_secs(10); @@ -66,6 +65,7 @@ pub(crate) async fn create_connection_handler( listen_port: u16, is_gateway: bool, bandwidth_limit: Option, + known_gateways: &[PeerKeyLocation], ) -> Result<(OutboundConnectionHandler, InboundConnectionHandler), TransportError> { // Bind the UDP socket to the specified port let bind_addr: SocketAddr = (listen_host, listen_port).into(); @@ -82,12 +82,23 @@ pub(crate) async fn create_connection_handler( is_gateway, "UDP socket bound successfully" ); + let gateway_addrs: Option>> = if is_gateway { + None + } else { + Some(Arc::new( + known_gateways + .iter() + .map(|g| g.peer.addr) + .collect::>(), + )) + }; let (och, new_connection_notifier) = OutboundConnectionHandler::config_listener( Arc::new(socket), keypair, is_gateway, (listen_host, listen_port).into(), bandwidth_limit, + gateway_addrs.clone(), )?; Ok(( och, @@ -121,7 +132,7 @@ impl InboundConnectionHandler { #[derive(Clone)] pub(crate) struct OutboundConnectionHandler { send_queue: mpsc::Sender<(SocketAddr, ConnectionEvent)>, - expected_non_gateway: Arc>>, + expected_non_gateway: Arc>, } #[cfg(test)] @@ -129,7 +140,7 @@ impl OutboundConnectionHandler { pub fn new(send_queue: mpsc::Sender<(SocketAddr, ConnectionEvent)>) -> Self { OutboundConnectionHandler { send_queue, - expected_non_gateway: Arc::new(Mutex::new(HashSet::new())), + expected_non_gateway: Arc::new(DashSet::new()), } } } @@ -141,6 +152,7 @@ impl OutboundConnectionHandler { is_gateway: bool, socket_addr: SocketAddr, bandwidth_limit: Option, + known_gateway_addrs: Option>>, ) -> Result<(Self, mpsc::Receiver), TransportError> { // Channel buffer is one so senders will await until the receiver is ready, important for bandwidth limiting let (conn_handler_sender, conn_handler_receiver) = mpsc::channel(100); @@ -148,7 +160,7 @@ impl OutboundConnectionHandler { // Channel buffer is one so senders will await until the receiver is ready, important for bandwidth limiting let (outbound_sender, outbound_recv) = mpsc::channel(100); - let expected_non_gateway = Arc::new(Mutex::new(HashSet::new())); + let expected_non_gateway = Arc::new(DashSet::new()); let transport = UdpPacketsListener { is_gateway, @@ -163,6 +175,7 @@ impl OutboundConnectionHandler { last_drop_warning: Instant::now(), bandwidth_limit, expected_non_gateway: expected_non_gateway.clone(), + known_gateway_addrs: known_gateway_addrs.clone(), }; let bw_tracker = super::rate_limiter::PacketRateLimiter::new( DEFAULT_BW_TRACKER_WINDOW_SIZE, @@ -198,7 +211,7 @@ impl OutboundConnectionHandler { keypair: TransportKeypair, is_gateway: bool, ) -> Result<(Self, mpsc::Receiver), TransportError> { - Self::config_listener(socket, keypair, is_gateway, socket_addr, None) + Self::config_listener(socket, keypair, is_gateway, socket_addr, None, None) } pub async fn connect( @@ -206,8 +219,8 @@ impl OutboundConnectionHandler { remote_public_key: TransportPublicKey, remote_addr: SocketAddr, ) -> Pin> + Send>> { - if let Ok(mut guard) = self.expected_non_gateway.lock() { - guard.insert(remote_addr.ip()); + if self.expected_non_gateway.insert(remote_addr.ip()) { + tracing::debug!(%remote_addr, "awaiting outbound handshake response from remote IP"); } let (open_connection, recv_connection) = oneshot::channel(); if self @@ -234,6 +247,12 @@ impl OutboundConnectionHandler { }) .boxed() } + + pub fn expect_incoming(&self, remote_addr: SocketAddr) { + if self.expected_non_gateway.insert(remote_addr.ip()) { + tracing::debug!(%remote_addr, "registered expected inbound handshake from remote IP"); + } + } } /// Handles UDP transport internally. @@ -249,7 +268,8 @@ struct UdpPacketsListener { dropped_packets: HashMap, last_drop_warning: Instant, bandwidth_limit: Option, - expected_non_gateway: Arc>>, + expected_non_gateway: Arc>, + known_gateway_addrs: Option>>, } type OngoingConnection = ( @@ -416,17 +436,25 @@ impl UdpPacketsListener { } if !self.is_gateway { - let allow = self - .expected_non_gateway - .lock() - .map(|set| set.contains(&remote_addr.ip())) + let allow = self.expected_non_gateway.contains(&remote_addr.ip()); + let gateway_allow = self + .known_gateway_addrs + .as_ref() + .map(|set| set.contains(&remote_addr)) .unwrap_or(false); - if !allow { + if !allow && gateway_allow { tracing::debug!( + %remote_addr, + "allowing inbound handshake from known gateway without prior expectation" + ); + } + if !allow && !gateway_allow { + tracing::warn!( %remote_addr, %size, - "unexpected packet from non-gateway node" + "unexpected packet from non-gateway node; dropping intro packet" ); + self.expected_non_gateway.insert(remote_addr.ip()); continue; } } @@ -497,8 +525,15 @@ impl UdpPacketsListener { match res.expect("task shouldn't panic") { Ok((outbound_remote_conn, inbound_remote_connection)) => { if let Some((_, result_sender)) = ongoing_connections.remove(&outbound_remote_conn.remote_addr) { - if let Ok(mut set) = self.expected_non_gateway.lock() { - set.remove(&outbound_remote_conn.remote_addr.ip()); + if self + .expected_non_gateway + .remove(&outbound_remote_conn.remote_addr.ip()) + .is_some() + { + tracing::debug!( + remote_addr = %outbound_remote_conn.remote_addr, + "cleared expected handshake flag after successful connection" + ); } tracing::debug!(remote_addr = %outbound_remote_conn.remote_addr, "connection established"); self.remote_connections.insert(outbound_remote_conn.remote_addr, inbound_remote_connection); @@ -521,8 +556,12 @@ impl UdpPacketsListener { } } if let Some((_, result_sender)) = ongoing_connections.remove(&remote_addr) { - if let Ok(mut set) = self.expected_non_gateway.lock() { - set.remove(&remote_addr.ip()); + if self + .expected_non_gateway + .remove(&remote_addr.ip()) + .is_some() + { + tracing::debug!(%remote_addr, "cleared expected handshake flag after failed connection"); } let _ = result_sender.send(Err(error)); } @@ -570,9 +609,7 @@ impl UdpPacketsListener { remote_addr, remote_public_key.clone(), ); - if let Ok(mut set) = self.expected_non_gateway.lock() { - set.insert(remote_addr.ip()); - } + self.expected_non_gateway.insert(remote_addr.ip()); let task = tokio::spawn(ongoing_connection .map_err(move |err| (err, remote_addr)) .instrument(span!(tracing::Level::DEBUG, "traverse_nat")) @@ -713,14 +750,6 @@ impl UdpPacketsListener { %remote_addr, "Starting NAT traversal" ); - // Constants for exponential backoff - const INITIAL_TIMEOUT: Duration = Duration::from_millis(600); - const TIMEOUT_MULTIPLIER: f64 = 1.2; - #[cfg(not(test))] - const MAX_TIMEOUT: Duration = Duration::from_secs(60); // Maximum timeout limit - #[cfg(test)] - const MAX_TIMEOUT: Duration = Duration::from_secs(10); // Maximum timeout limit - #[allow(clippy::large_enum_variant)] enum ConnectionState { /// Initial state of the joiner @@ -770,12 +799,11 @@ impl UdpPacketsListener { let f = async move { tracing::info!(%remote_addr, "Starting outbound handshake (NAT traversal)"); let mut state = ConnectionState::StartOutbound {}; - // Initialize timeout and interval - let mut timeout = INITIAL_TIMEOUT; - let mut interval_duration = INITIAL_INTERVAL; - let mut tick = tokio::time::interval(interval_duration); - - let mut failures = 0; + let mut attempts = 0usize; + let start_time = Instant::now(); + let overall_deadline = Duration::from_secs(3); + let mut resend_tick = tokio::time::interval(INITIAL_INTERVAL); + resend_tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); let inbound_sym_key_bytes = rand::random::<[u8; 16]>(); let inbound_sym_key = Aes128Gcm::new(&inbound_sym_key_bytes.into()); @@ -790,7 +818,7 @@ impl UdpPacketsListener { let mut sent_tracker = SentPacketTracker::new(); - while failures < NAT_TRAVERSAL_MAX_ATTEMPTS { + while attempts < NAT_TRAVERSAL_MAX_ATTEMPTS && start_time.elapsed() < overall_deadline { match state { ConnectionState::StartOutbound => { tracing::debug!(%remote_addr, "sending protocol version and inbound key"); @@ -798,6 +826,7 @@ impl UdpPacketsListener { .send((remote_addr, outbound_intro_packet.data().into())) .await .map_err(|_| TransportError::ChannelClosed)?; + attempts += 1; } ConnectionState::RemoteInbound { .. } => { tracing::debug!(%remote_addr, "sending back protocol version and inbound key to remote"); @@ -816,7 +845,8 @@ impl UdpPacketsListener { ); } } - let next_inbound = tokio::time::timeout(timeout, next_inbound.recv()); + let next_inbound = + tokio::time::timeout(Duration::from_millis(200), next_inbound.recv()); match next_inbound.await { Ok(Some(packet)) => { tracing::debug!(%remote_addr, "received packet after sending it"); @@ -871,7 +901,6 @@ impl UdpPacketsListener { .map_err(|_| TransportError::ChannelClosed)?; let (inbound_sender, inbound_recv) = mpsc::channel(100); tracing::debug!(%remote_addr, "connection established"); - let attempts = failures + 1; tracing::info!(%remote_addr, attempts = attempts, "Outbound handshake completed (ack path)"); return Ok(( RemoteConnection { @@ -903,7 +932,6 @@ impl UdpPacketsListener { } _ => { tracing::debug!(%remote_addr, "unexpected packet from remote"); - failures += 1; continue; } } @@ -922,7 +950,6 @@ impl UdpPacketsListener { continue; } - failures += 1; tracing::debug!("Failed to decrypt packet"); continue; } @@ -935,13 +962,10 @@ impl UdpPacketsListener { // intro packet so we need to handle that if packet.is_intro_packet(intro_packet) { tracing::debug!(%remote_addr, "received intro packet"); - // we add to the number of failures so we are not stuck in a loop retrying - failures += 1; continue; } // if is not an intro packet, the connection is successful and we can proceed let (inbound_sender, inbound_recv) = mpsc::channel(100); - let attempts = failures + 1; tracing::info!(%remote_addr, attempts = attempts, "Outbound handshake completed (inbound ack path)"); return Ok(( RemoteConnection { @@ -972,40 +996,19 @@ impl UdpPacketsListener { return Err(TransportError::ConnectionClosed(remote_addr)); } Err(_) => { - failures += 1; tracing::debug!(%this_addr, %remote_addr, "failed to receive UDP response in time, retrying"); } } - // We have retried for a while, so return an error - if timeout >= MAX_TIMEOUT { - tracing::error!(%this_addr, %remote_addr, "failed to establish connection after multiple attempts, max timeout reached"); - break; - } - - // Update timeout using exponential backoff, capped at MAX_TIMEOUT - timeout = std::cmp::min( - Duration::from_millis( - ((timeout.as_millis()) as f64 * TIMEOUT_MULTIPLIER) as u64, - ), - MAX_TIMEOUT, - ); - - // Update interval, capped at MAX_INTERVAL - if interval_duration < MAX_INTERVAL { - interval_duration = std::cmp::min( - Duration::from_millis( - interval_duration.as_millis() as u64 * INTERVAL_INCREASE_FACTOR, - ), - MAX_INTERVAL, - ); - tick = tokio::time::interval(interval_duration); - } - - tick.tick().await; + resend_tick.tick().await; } - tracing::warn!(%remote_addr, attempts = failures, "Outbound handshake failed: max connection attempts reached"); + tracing::warn!( + %remote_addr, + attempts, + elapsed_ms = start_time.elapsed().as_millis(), + "Outbound handshake failed: max connection attempts reached" + ); Err(TransportError::ConnectionEstablishmentFailure { cause: "max connection attempts reached".into(), }) diff --git a/crates/core/src/transport/peer_connection.rs b/crates/core/src/transport/peer_connection.rs index e994a8b99..e96447dab 100644 --- a/crates/core/src/transport/peer_connection.rs +++ b/crates/core/src/transport/peer_connection.rs @@ -335,7 +335,7 @@ impl PeerConnection { // listen for incoming messages or receipts or wait until is time to do anything else again let mut resend_check = Some(tokio::time::sleep(tokio::time::Duration::from_millis(10))); - const KILL_CONNECTION_AFTER: Duration = Duration::from_secs(30); + const KILL_CONNECTION_AFTER: Duration = Duration::from_secs(120); let mut last_received = std::time::Instant::now(); // Check for timeout periodically diff --git a/crates/freenet-macros/src/codegen.rs b/crates/freenet-macros/src/codegen.rs index 7f470f566..d2ae2ecee 100644 --- a/crates/freenet-macros/src/codegen.rs +++ b/crates/freenet-macros/src/codegen.rs @@ -62,9 +62,9 @@ pub fn generate_test_code(args: FreenetTestArgs, input_fn: ItemFn) -> Result Date: Thu, 6 Nov 2025 00:33:08 +0100 Subject: [PATCH 11/26] feat(connect): add connect_v2 scaffolding --- crates/core/src/operations/connect_v2.rs | 158 +++++++++++++++++++++++ crates/core/src/operations/mod.rs | 2 + 2 files changed, 160 insertions(+) create mode 100644 crates/core/src/operations/connect_v2.rs diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs new file mode 100644 index 000000000..9b7cbbe41 --- /dev/null +++ b/crates/core/src/operations/connect_v2.rs @@ -0,0 +1,158 @@ +//! Prototype implementation of the simplified two-message connect flow. +//! +//! This module is *not yet wired in*. It exists so we can develop the new +//! handshake logic incrementally while keeping the current connect operation +//! intact. Once fully implemented we will switch the node to use this module +//! and delete the legacy code. + +#![allow(dead_code)] + +use std::collections::HashSet; +use std::net::SocketAddr; +use std::time::Instant; + +use serde::{Deserialize, Serialize}; + +use crate::dev_tool::Location; +use crate::message::Transaction; +use crate::ring::PeerKeyLocation; +use crate::util::Backoff; + +/// Top-level message envelope used by the new connect handshake. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) enum ConnectMsgV2 { + /// Join request that travels *towards* the target location. + Request { + id: Transaction, + from: PeerKeyLocation, + target: PeerKeyLocation, + payload: ConnectRequest, + }, + /// Join acceptance that travels back along the discovered path. + Response { + id: Transaction, + sender: PeerKeyLocation, + target: PeerKeyLocation, + payload: ConnectResponse, + }, + /// Informational packet letting the joiner know the address a peer observed. + ObservedAddress { + id: Transaction, + target: PeerKeyLocation, + address: SocketAddr, + }, +} + +/// Two-message request payload. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub(crate) struct ConnectRequest { + /// Joiner's advertised location (fallbacks to the joiner's socket address). + pub desired_location: Location, + /// Joiner's identity as observed so far. + pub origin: PeerKeyLocation, + /// Remaining hops before the request stops travelling. + pub ttl: u8, + /// Simple visited set to avoid trivial loops. + pub visited: Vec, +} + +/// Acceptance payload returned by candidates. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub(crate) struct ConnectResponse { + /// The peer that accepted the join request. + pub acceptor: PeerKeyLocation, + /// Whether this acceptance is a short-lived courtesy link. + pub courtesy: bool, +} + +/// New minimal state machine the joiner tracks. +#[derive(Debug)] +pub(crate) enum ConnectState { + /// Joiner waiting for acceptances. + WaitingForResponses(JoinerState), + /// Intermediate peer evaluating and forwarding requests. + Relaying(RelayState), + /// Joiner obtained the required neighbours. + Completed, +} + +#[derive(Debug)] +pub(crate) struct JoinerState { + pub desired_location: Location, + pub target_connections: usize, + pub observed_address: Option, + pub accepted: HashSet, + pub last_progress: Instant, +} + +#[derive(Debug)] +pub(crate) struct RelayState { + pub upstream: PeerKeyLocation, + pub request: ConnectRequest, + pub forwarded_to: Option, + pub courtesy_hint: bool, + pub observed_sent: bool, + pub accepted_locally: bool, +} + +/// Placeholder operation wrapper so we can exercise the logic in isolation in +/// forthcoming commits. For now this simply captures the shared state we will +/// migrate to. +#[derive(Debug)] +pub(crate) struct ConnectOpV2 { + pub(crate) id: Transaction, + pub(crate) state: Option, + pub(crate) gateway: Option>, + pub(crate) backoff: Option, +} + +impl ConnectOpV2 { + #[allow(clippy::too_many_arguments)] + pub(crate) fn new_joiner( + id: Transaction, + desired_location: Location, + target_connections: usize, + observed_address: Option, + gateway: Option, + backoff: Option, + ) -> Self { + let state = ConnectState::WaitingForResponses(JoinerState { + desired_location, + target_connections, + observed_address, + accepted: HashSet::new(), + last_progress: Instant::now(), + }); + Self { + id, + state: Some(state), + gateway: gateway.map(Box::new), + backoff, + } + } + + pub(crate) fn new_relay( + id: Transaction, + upstream: PeerKeyLocation, + request: ConnectRequest, + ) -> Self { + let state = ConnectState::Relaying(RelayState { + upstream, + request, + forwarded_to: None, + courtesy_hint: false, + observed_sent: false, + accepted_locally: false, + }); + Self { + id, + state: Some(state), + gateway: None, + backoff: None, + } + } + + pub(crate) fn is_completed(&self) -> bool { + matches!(self.state, Some(ConnectState::Completed)) + } +} diff --git a/crates/core/src/operations/mod.rs b/crates/core/src/operations/mod.rs index 9af80bc84..70b1af3bd 100644 --- a/crates/core/src/operations/mod.rs +++ b/crates/core/src/operations/mod.rs @@ -15,6 +15,8 @@ use crate::{ }; pub(crate) mod connect; +#[allow(dead_code)] +pub(crate) mod connect_v2; pub(crate) mod get; pub(crate) mod put; pub(crate) mod subscribe; From e7d5c5c9d929954c78212ebe1cf5c5d9616b6f04 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 00:36:19 +0100 Subject: [PATCH 12/26] feat(connect): add joiner and relay logic skeleton --- crates/core/src/operations/connect_v2.rs | 133 ++++++++++++++++++++++- 1 file changed, 128 insertions(+), 5 deletions(-) diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index 9b7cbbe41..702fae9f0 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -5,8 +5,6 @@ //! intact. Once fully implemented we will switch the node to use this module //! and delete the legacy code. -#![allow(dead_code)] - use std::collections::HashSet; use std::net::SocketAddr; use std::time::Instant; @@ -71,7 +69,7 @@ pub(crate) enum ConnectState { /// Joiner waiting for acceptances. WaitingForResponses(JoinerState), /// Intermediate peer evaluating and forwarding requests. - Relaying(RelayState), + Relaying(Box), /// Joiner obtained the required neighbours. Completed, } @@ -95,6 +93,124 @@ pub(crate) struct RelayState { pub accepted_locally: bool, } +/// Abstractions required to evaluate an inbound connect request at an +/// intermediate peer. +pub(crate) trait RelayContext { + /// Location of the current peer. + fn self_location(&self) -> &PeerKeyLocation; + + /// Determine whether we should accept the joiner immediately. + fn should_accept(&self, joiner: &PeerKeyLocation) -> bool; + + /// Choose the next hop for the request, avoiding peers already visited. + fn select_next_hop( + &self, + desired_location: Location, + visited: &[PeerKeyLocation], + ) -> Option; + + /// Whether the acceptance should be treated as a short-lived courtesy link. + fn courtesy_hint(&self, acceptor: &PeerKeyLocation, joiner: &PeerKeyLocation) -> bool; +} + +/// Result of processing a request at a relay. +#[derive(Debug, Default)] +pub(crate) struct RelayActions { + pub accept_response: Option, + pub expect_connection_from: Option, + pub forward: Option<(PeerKeyLocation, ConnectRequest)>, + pub observed_address: Option<(PeerKeyLocation, SocketAddr)>, +} + +impl RelayState { + pub(crate) fn handle_request( + &mut self, + ctx: &C, + observed_remote: &PeerKeyLocation, + observed_addr: SocketAddr, + ) -> RelayActions { + let mut actions = RelayActions::default(); + push_unique_peer(&mut self.request.visited, observed_remote.clone()); + push_unique_peer(&mut self.request.visited, ctx.self_location().clone()); + + if self.request.origin.peer.addr.ip().is_unspecified() + && !self.observed_sent + && observed_remote.peer.pub_key == self.request.origin.peer.pub_key + { + self.request.origin.peer.addr = observed_addr; + if self.request.origin.location.is_none() { + self.request.origin.location = Some(Location::from_address(&observed_addr)); + } + self.observed_sent = true; + actions.observed_address = Some((self.request.origin.clone(), observed_addr)); + } + + if !self.accepted_locally && ctx.should_accept(&self.request.origin) { + self.accepted_locally = true; + let acceptor = ctx.self_location().clone(); + let courtesy = ctx.courtesy_hint(&acceptor, &self.request.origin); + self.courtesy_hint = courtesy; + actions.accept_response = Some(ConnectResponse { + acceptor: acceptor.clone(), + courtesy, + }); + actions.expect_connection_from = Some(self.request.origin.clone()); + } + + if self.forwarded_to.is_none() && self.request.ttl > 0 { + if let Some(next) = + ctx.select_next_hop(self.request.desired_location, &self.request.visited) + { + let mut forward_req = self.request.clone(); + forward_req.ttl = forward_req.ttl.saturating_sub(1); + push_unique_peer(&mut forward_req.visited, ctx.self_location().clone()); + let forward_snapshot = forward_req.clone(); + self.forwarded_to = Some(next.clone()); + self.request = forward_req; + actions.forward = Some((next, forward_snapshot)); + } + } + + actions + } +} + +#[derive(Debug)] +pub struct AcceptedPeer { + pub peer: PeerKeyLocation, + pub courtesy: bool, +} + +#[derive(Debug, Default)] +pub struct JoinerAcceptance { + pub new_acceptor: Option, + pub satisfied: bool, +} + +impl JoinerState { + pub(crate) fn register_acceptance( + &mut self, + response: &ConnectResponse, + now: Instant, + ) -> JoinerAcceptance { + let mut acceptance = JoinerAcceptance::default(); + if self.accepted.insert(response.acceptor.clone()) { + self.last_progress = now; + acceptance.new_acceptor = Some(AcceptedPeer { + peer: response.acceptor.clone(), + courtesy: response.courtesy, + }); + } + acceptance.satisfied = self.accepted.len() >= self.target_connections; + acceptance + } + + pub(crate) fn update_observed_address(&mut self, address: SocketAddr, now: Instant) { + self.observed_address = Some(address); + self.last_progress = now; + } +} + /// Placeholder operation wrapper so we can exercise the logic in isolation in /// forthcoming commits. For now this simply captures the shared state we will /// migrate to. @@ -136,14 +252,14 @@ impl ConnectOpV2 { upstream: PeerKeyLocation, request: ConnectRequest, ) -> Self { - let state = ConnectState::Relaying(RelayState { + let state = ConnectState::Relaying(Box::new(RelayState { upstream, request, forwarded_to: None, courtesy_hint: false, observed_sent: false, accepted_locally: false, - }); + })); Self { id, state: Some(state), @@ -156,3 +272,10 @@ impl ConnectOpV2 { matches!(self.state, Some(ConnectState::Completed)) } } + +fn push_unique_peer(list: &mut Vec, peer: PeerKeyLocation) { + let already_present = list.iter().any(|p| p.peer == peer.peer); + if !already_present { + list.push(peer); + } +} From 9a558cdd14c03905328f31f95df8bbbbf07a1f4b Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 00:38:46 +0100 Subject: [PATCH 13/26] feat(connect): expose v2 handler entry points --- crates/core/src/operations/connect_v2.rs | 52 ++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index 702fae9f0..9a256fb7f 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -271,6 +271,58 @@ impl ConnectOpV2 { pub(crate) fn is_completed(&self) -> bool { matches!(self.state, Some(ConnectState::Completed)) } + + pub(crate) fn handle_response( + &mut self, + response: &ConnectResponse, + now: Instant, + ) -> Option { + match self.state.as_mut() { + Some(ConnectState::WaitingForResponses(state)) => { + let result = state.register_acceptance(response, now); + if result.satisfied { + self.state = Some(ConnectState::Completed); + } + Some(result) + } + _ => None, + } + } + + pub(crate) fn handle_observed_address(&mut self, address: SocketAddr, now: Instant) { + if let Some(ConnectState::WaitingForResponses(state)) = self.state.as_mut() { + state.update_observed_address(address, now); + } + } + + pub(crate) fn handle_request( + &mut self, + ctx: &C, + upstream: PeerKeyLocation, + request: ConnectRequest, + observed_addr: SocketAddr, + ) -> RelayActions { + if !matches!(self.state, Some(ConnectState::Relaying(_))) { + self.state = Some(ConnectState::Relaying(Box::new(RelayState { + upstream: upstream.clone(), + request: request.clone(), + forwarded_to: None, + courtesy_hint: false, + observed_sent: false, + accepted_locally: false, + }))); + } + + match self.state.as_mut() { + Some(ConnectState::Relaying(state)) => { + state.upstream = upstream; + state.request = request; + let upstream_snapshot = state.upstream.clone(); + state.handle_request(ctx, &upstream_snapshot, observed_addr) + } + _ => RelayActions::default(), + } + } } fn push_unique_peer(list: &mut Vec, peer: PeerKeyLocation) { From 17e7fb62076b22661bc7f241abec4f6850084648 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 00:40:37 +0100 Subject: [PATCH 14/26] feat(connect): add relay context adapter --- crates/core/src/operations/connect_v2.rs | 67 +++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index 9a256fb7f..484fb70d9 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -13,8 +13,9 @@ use serde::{Deserialize, Serialize}; use crate::dev_tool::Location; use crate::message::Transaction; +use crate::node::{OpManager, PeerId}; use crate::ring::PeerKeyLocation; -use crate::util::Backoff; +use crate::util::{Backoff, Contains}; /// Top-level message envelope used by the new connect handshake. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -175,6 +176,54 @@ impl RelayState { } } +pub(crate) struct RelayEnv<'a> { + pub op_manager: &'a OpManager, + self_location: PeerKeyLocation, +} + +impl<'a> RelayEnv<'a> { + pub fn new(op_manager: &'a OpManager) -> Self { + let self_location = op_manager.ring.connection_manager.own_location(); + Self { + op_manager, + self_location, + } + } +} + +impl RelayContext for RelayEnv<'_> { + fn self_location(&self) -> &PeerKeyLocation { + &self.self_location + } + + fn should_accept(&self, joiner: &PeerKeyLocation) -> bool { + let location = joiner + .location + .unwrap_or_else(|| Location::from_address(&joiner.peer.addr)); + self.op_manager + .ring + .connection_manager + .should_accept(location, &joiner.peer) + } + + fn select_next_hop( + &self, + desired_location: Location, + visited: &[PeerKeyLocation], + ) -> Option { + let skip = VisitedPeerIds { peers: visited }; + let router = self.op_manager.ring.router.read(); + self.op_manager + .ring + .connection_manager + .routing(desired_location, None, skip, &router) + } + + fn courtesy_hint(&self, _acceptor: &PeerKeyLocation, _joiner: &PeerKeyLocation) -> bool { + self.op_manager.ring.open_connections() == 0 + } +} + #[derive(Debug)] pub struct AcceptedPeer { pub peer: PeerKeyLocation, @@ -325,6 +374,22 @@ impl ConnectOpV2 { } } +struct VisitedPeerIds<'a> { + peers: &'a [PeerKeyLocation], +} + +impl Contains for VisitedPeerIds<'_> { + fn has_element(&self, target: PeerId) -> bool { + self.peers.iter().any(|p| p.peer == target) + } +} + +impl Contains<&PeerId> for VisitedPeerIds<'_> { + fn has_element(&self, target: &PeerId) -> bool { + self.peers.iter().any(|p| &p.peer == target) + } +} + fn push_unique_peer(list: &mut Vec, peer: PeerKeyLocation) { let already_present = list.iter().any(|p| p.peer == peer.peer); if !already_present { From 4c9db2566b0e9af61af16e8c78efaea78693069f Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 00:45:20 +0100 Subject: [PATCH 15/26] test(connect): cover v2 helper behaviours --- crates/core/src/operations/connect_v2.rs | 155 +++++++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index 484fb70d9..31d9462a5 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -396,3 +396,158 @@ fn push_unique_peer(list: &mut Vec, peer: PeerKeyLocation) { list.push(peer); } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::node::PeerId; + use crate::transport::TransportKeypair; + use std::net::{IpAddr, Ipv4Addr, SocketAddr}; + use std::time::Instant; + + struct TestRelayContext { + self_loc: PeerKeyLocation, + accept: bool, + next_hop: Option, + courtesy: bool, + } + + impl TestRelayContext { + fn new(self_loc: PeerKeyLocation) -> Self { + Self { + self_loc, + accept: true, + next_hop: None, + courtesy: false, + } + } + + fn accept(mut self, accept: bool) -> Self { + self.accept = accept; + self + } + + fn next_hop(mut self, hop: Option) -> Self { + self.next_hop = hop; + self + } + + fn courtesy(mut self, courtesy: bool) -> Self { + self.courtesy = courtesy; + self + } + } + + impl RelayContext for TestRelayContext { + fn self_location(&self) -> &PeerKeyLocation { + &self.self_loc + } + + fn should_accept(&self, _joiner: &PeerKeyLocation) -> bool { + self.accept + } + + fn select_next_hop( + &self, + _desired_location: Location, + _visited: &[PeerKeyLocation], + ) -> Option { + self.next_hop.clone() + } + + fn courtesy_hint(&self, _acceptor: &PeerKeyLocation, _joiner: &PeerKeyLocation) -> bool { + self.courtesy + } + } + + fn make_peer(port: u16) -> PeerKeyLocation { + let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), port); + let keypair = TransportKeypair::new(); + PeerKeyLocation { + peer: PeerId::new(addr, keypair.public().clone()), + location: Some(Location::random()), + } + } + + #[test] + fn relay_accepts_when_policy_allows() { + let self_loc = make_peer(4000); + let joiner = make_peer(5000); + let mut state = RelayState { + upstream: joiner.clone(), + request: ConnectRequest { + desired_location: Location::random(), + origin: joiner.clone(), + ttl: 3, + visited: vec![], + }, + forwarded_to: None, + courtesy_hint: false, + observed_sent: false, + accepted_locally: false, + }; + + let ctx = TestRelayContext::new(self_loc.clone()).courtesy(true); + let observed_addr = joiner.peer.addr; + let actions = state.handle_request(&ctx, &joiner, observed_addr); + + let response = actions.accept_response.expect("expected acceptance"); + assert_eq!(response.acceptor.peer, self_loc.peer); + assert!(response.courtesy); + assert_eq!(actions.expect_connection_from.unwrap().peer, joiner.peer); + assert!(actions.forward.is_none()); + } + + #[test] + fn relay_forwards_when_not_accepting() { + let self_loc = make_peer(4100); + let joiner = make_peer(5100); + let next_hop = make_peer(6100); + let mut state = RelayState { + upstream: joiner.clone(), + request: ConnectRequest { + desired_location: Location::random(), + origin: joiner.clone(), + ttl: 2, + visited: vec![], + }, + forwarded_to: None, + courtesy_hint: false, + observed_sent: false, + accepted_locally: false, + }; + + let ctx = TestRelayContext::new(self_loc) + .accept(false) + .next_hop(Some(next_hop.clone())); + let actions = state.handle_request(&ctx, &joiner, joiner.peer.addr); + + assert!(actions.accept_response.is_none()); + let (forward_to, request) = actions.forward.expect("expected forward"); + assert_eq!(forward_to.peer, next_hop.peer); + assert_eq!(request.ttl, 1); + assert!(request.visited.iter().any(|pkl| pkl.peer == joiner.peer)); + } + + #[test] + fn joiner_tracks_acceptance() { + let acceptor = make_peer(7000); + let mut state = JoinerState { + desired_location: Location::random(), + target_connections: 1, + observed_address: None, + accepted: HashSet::new(), + last_progress: Instant::now(), + }; + + let response = ConnectResponse { + acceptor: acceptor.clone(), + courtesy: false, + }; + let result = state.register_acceptance(&response, Instant::now()); + assert!(result.satisfied); + let new = result.new_acceptor.expect("expected new acceptor"); + assert_eq!(new.peer.peer, acceptor.peer); + assert!(!new.courtesy); + } +} From b8ac88f7f772a1e3a35eccf4a98d82a0fd2707ec Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 00:51:25 +0100 Subject: [PATCH 16/26] feat(connect): add net message support for v2 handshake --- crates/core/src/message.rs | 22 +++++++++- crates/core/src/operations/connect_v2.rs | 52 +++++++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/crates/core/src/message.rs b/crates/core/src/message.rs index 4080bff20..264011dfb 100644 --- a/crates/core/src/message.rs +++ b/crates/core/src/message.rs @@ -12,7 +12,8 @@ use crate::{ client_events::{ClientId, HostResult}, node::PeerId, operations::{ - connect::ConnectMsg, get::GetMsg, put::PutMsg, subscribe::SubscribeMsg, update::UpdateMsg, + connect::ConnectMsg, connect_v2::ConnectMsgV2, get::GetMsg, put::PutMsg, + subscribe::SubscribeMsg, update::UpdateMsg, }, ring::{Location, PeerKeyLocation}, }; @@ -193,11 +194,18 @@ where mod sealed_msg_type { use super::*; + use crate::operations::connect_v2::ConnectMsgV2; pub trait SealedTxType { fn tx_type_id() -> TransactionTypeId; } + impl SealedTxType for ConnectMsgV2 { + fn tx_type_id() -> TransactionTypeId { + TransactionTypeId(TransactionType::Connect) + } + } + #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)] #[cfg_attr(test, derive(arbitrary::Arbitrary))] pub struct TransactionTypeId(pub(super) TransactionType); @@ -258,6 +266,12 @@ mod sealed_msg_type { }); } +impl From for NetMessage { + fn from(msg: ConnectMsgV2) -> Self { + NetMessage::V1(NetMessageV1::ConnectV2(msg)) + } +} + pub(crate) trait MessageStats { fn id(&self) -> &Transaction; @@ -274,6 +288,7 @@ pub(crate) enum NetMessage { #[derive(Debug, Serialize, Deserialize, Clone)] pub(crate) enum NetMessageV1 { Connect(ConnectMsg), + ConnectV2(ConnectMsgV2), Put(PutMsg), Get(GetMsg), Subscribe(SubscribeMsg), @@ -302,6 +317,7 @@ impl Versioned for NetMessageV1 { fn version(&self) -> semver::Version { match self { NetMessageV1::Connect(_) => semver::Version::new(1, 0, 0), + NetMessageV1::ConnectV2(_) => semver::Version::new(1, 1, 0), NetMessageV1::Put(_) => semver::Version::new(1, 0, 0), NetMessageV1::Get(_) => semver::Version::new(1, 0, 0), NetMessageV1::Subscribe(_) => semver::Version::new(1, 0, 0), @@ -482,6 +498,7 @@ impl MessageStats for NetMessageV1 { fn id(&self) -> &Transaction { match self { NetMessageV1::Connect(op) => op.id(), + NetMessageV1::ConnectV2(op) => op.id(), NetMessageV1::Put(op) => op.id(), NetMessageV1::Get(op) => op.id(), NetMessageV1::Subscribe(op) => op.id(), @@ -494,6 +511,7 @@ impl MessageStats for NetMessageV1 { fn target(&self) -> Option { match self { NetMessageV1::Connect(op) => op.target().as_ref().map(|b| b.borrow().clone()), + NetMessageV1::ConnectV2(op) => op.target().cloned(), NetMessageV1::Put(op) => op.target().as_ref().map(|b| b.borrow().clone()), NetMessageV1::Get(op) => op.target().as_ref().map(|b| b.borrow().clone()), NetMessageV1::Subscribe(op) => op.target().as_ref().map(|b| b.borrow().clone()), @@ -506,6 +524,7 @@ impl MessageStats for NetMessageV1 { fn requested_location(&self) -> Option { match self { NetMessageV1::Connect(op) => op.requested_location(), + NetMessageV1::ConnectV2(op) => op.requested_location(), NetMessageV1::Put(op) => op.requested_location(), NetMessageV1::Get(op) => op.requested_location(), NetMessageV1::Subscribe(op) => op.requested_location(), @@ -523,6 +542,7 @@ impl Display for NetMessage { match self { NetMessage::V1(msg) => match msg { Connect(msg) => msg.fmt(f)?, + ConnectV2(msg) => msg.fmt(f)?, Put(msg) => msg.fmt(f)?, Get(msg) => msg.fmt(f)?, Subscribe(msg) => msg.fmt(f)?, diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index 31d9462a5..ccae9eddc 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -6,13 +6,14 @@ //! and delete the legacy code. use std::collections::HashSet; +use std::fmt; use std::net::SocketAddr; use std::time::Instant; use serde::{Deserialize, Serialize}; use crate::dev_tool::Location; -use crate::message::Transaction; +use crate::message::{InnerMessage, Transaction}; use crate::node::{OpManager, PeerId}; use crate::ring::PeerKeyLocation; use crate::util::{Backoff, Contains}; @@ -42,6 +43,55 @@ pub(crate) enum ConnectMsgV2 { }, } +impl InnerMessage for ConnectMsgV2 { + fn id(&self) -> &Transaction { + match self { + ConnectMsgV2::Request { id, .. } + | ConnectMsgV2::Response { id, .. } + | ConnectMsgV2::ObservedAddress { id, .. } => id, + } + } + + #[allow(refining_impl_trait)] + fn target(&self) -> Option<&PeerKeyLocation> { + match self { + ConnectMsgV2::Request { target, .. } + | ConnectMsgV2::Response { target, .. } + | ConnectMsgV2::ObservedAddress { target, .. } => Some(target), + } + } + + fn requested_location(&self) -> Option { + match self { + ConnectMsgV2::Request { payload, .. } => Some(payload.desired_location), + _ => None, + } + } +} + +impl fmt::Display for ConnectMsgV2 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ConnectMsgV2::Request { target, payload, .. } => write!( + f, + "ConnectRequest {{ target: {target}, desired: {}, ttl: {}, origin: {} }}", + payload.desired_location, + payload.ttl, + payload.origin + ), + ConnectMsgV2::Response { sender, target, payload, .. } => write!( + f, + "ConnectResponse {{ sender: {sender}, target: {target}, acceptor: {}, courtesy: {} }}", + payload.acceptor, + payload.courtesy + ), + ConnectMsgV2::ObservedAddress { target, address, .. } => { + write!(f, "ObservedAddress {{ target: {target}, address: {address} }}") + } + } + } +} + /// Two-message request payload. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) struct ConnectRequest { From 0bd9ce8b93df5813cd0587437cc77ca04f689279 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 01:14:01 +0100 Subject: [PATCH 17/26] feat(connect): provide joiner request helper --- crates/core/src/operations/connect_v2.rs | 36 ++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index ccae9eddc..f7eeed72d 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -371,6 +371,42 @@ impl ConnectOpV2 { matches!(self.state, Some(ConnectState::Completed)) } + pub(crate) fn initiate_join_request( + op_manager: &OpManager, + target: PeerKeyLocation, + desired_location: Location, + ttl: u8, + ) -> (Transaction, Self, ConnectMsgV2) { + let own = op_manager.ring.connection_manager.own_location(); + let mut visited = vec![own.clone()]; + push_unique_peer(&mut visited, target.clone()); + let request = ConnectRequest { + desired_location, + origin: own.clone(), + ttl, + visited, + }; + + let tx = Transaction::new::(); + let op = ConnectOpV2::new_joiner( + tx, + desired_location, + op_manager.ring.connection_manager.min_connections, + Some(own.peer.addr), + Some(target.clone()), + None, + ); + + let msg = ConnectMsgV2::Request { + id: tx, + from: own, + target, + payload: request, + }; + + (tx, op, msg) + } + pub(crate) fn handle_response( &mut self, response: &ConnectResponse, From 3685adac2c4790b51f2d395fdda3959687467cab Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 01:17:42 +0100 Subject: [PATCH 18/26] test(connect): validate join request builder --- crates/core/src/operations/connect_v2.rs | 34 +++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index f7eeed72d..6a6e9311b 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -372,12 +372,12 @@ impl ConnectOpV2 { } pub(crate) fn initiate_join_request( - op_manager: &OpManager, + own: PeerKeyLocation, target: PeerKeyLocation, desired_location: Location, ttl: u8, + target_connections: usize, ) -> (Transaction, Self, ConnectMsgV2) { - let own = op_manager.ring.connection_manager.own_location(); let mut visited = vec![own.clone()]; push_unique_peer(&mut visited, target.clone()); let request = ConnectRequest { @@ -391,7 +391,7 @@ impl ConnectOpV2 { let op = ConnectOpV2::new_joiner( tx, desired_location, - op_manager.ring.connection_manager.min_connections, + target_connections, Some(own.peer.addr), Some(target.clone()), None, @@ -636,4 +636,32 @@ mod tests { assert_eq!(new.peer.peer, acceptor.peer); assert!(!new.courtesy); } + + #[test] + fn init_join_request_initializes_state() { + let target = make_peer(7200); + let desired = Location::random(); + let ttl = 5; + let own = make_peer(7300); + let (_tx, op, msg) = ConnectOpV2::initiate_join_request( + own.clone(), + target.clone(), + desired, + ttl, + 2, + ); + + match msg { + ConnectMsgV2::Request { from, target: msg_target, payload, .. } => { + assert_eq!(msg_target.peer, target.peer); + assert_eq!(payload.desired_location, desired); + assert_eq!(payload.ttl, ttl); + assert!(payload.visited.iter().any(|p| p.peer == from.peer)); + assert!(payload.visited.iter().any(|p| p.peer == target.peer)); + } + other => panic!("unexpected message: {other:?}"), + } + + assert!(matches!(op.state, Some(ConnectState::WaitingForResponses(_)))); + } } From 512c236fce3ef720d16c6a507538094bf01e42fc Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 01:56:03 +0100 Subject: [PATCH 19/26] feat(connect): wire connect_v2 operation scaffolding --- crates/core/src/node/mod.rs | 20 ++ crates/core/src/node/op_state_manager.rs | 17 +- crates/core/src/operations/connect_v2.rs | 246 +++++++++++++++++++++-- crates/core/src/operations/mod.rs | 2 + crates/core/src/util/mod.rs | 4 +- 5 files changed, 270 insertions(+), 19 deletions(-) diff --git a/crates/core/src/node/mod.rs b/crates/core/src/node/mod.rs index c50ac8be1..bd73ed845 100644 --- a/crates/core/src/node/mod.rs +++ b/crates/core/src/node/mod.rs @@ -1167,6 +1167,17 @@ async fn handle_aborted_op( connect::join_ring_request(backoff, &gateway, op_manager).await?; } } + Ok(Some(OpEnum::ConnectV2(op))) + if op.has_backoff() + && op_manager.ring.open_connections() + < op_manager.ring.connection_manager.min_connections => + { + let gateway = op.gateway().cloned(); + if let Some(gateway) = gateway { + tracing::warn!("Retry connecting to gateway {}", gateway.peer); + connect::join_ring_request(None, &gateway, op_manager).await?; + } + } Ok(Some(OpEnum::Connect(_))) => { // if no connections were achieved just fail if op_manager.ring.open_connections() == 0 && op_manager.ring.is_gateway() { @@ -1176,6 +1187,14 @@ async fn handle_aborted_op( } } } + Ok(Some(OpEnum::ConnectV2(_))) => { + if op_manager.ring.open_connections() == 0 && op_manager.ring.is_gateway() { + tracing::warn!("Retrying joining the ring with an other gateway"); + if let Some(gateway) = gateways.iter().shuffle().next() { + connect::join_ring_request(None, gateway, op_manager).await? + } + } + } _ => {} } } @@ -1461,6 +1480,7 @@ impl IsOperationCompleted for OpEnum { fn is_completed(&self) -> bool { match self { OpEnum::Connect(op) => op.is_completed(), + OpEnum::ConnectV2(op) => op.is_completed(), OpEnum::Put(op) => op.is_completed(), OpEnum::Get(op) => op.is_completed(), OpEnum::Subscribe(op) => op.is_completed(), diff --git a/crates/core/src/node/op_state_manager.rs b/crates/core/src/node/op_state_manager.rs index cd91e3705..df8b04fa4 100644 --- a/crates/core/src/node/op_state_manager.rs +++ b/crates/core/src/node/op_state_manager.rs @@ -187,6 +187,7 @@ impl SubOperationTracker { #[derive(Default)] struct Ops { connect: DashMap, + connect_v2: DashMap, put: DashMap, get: DashMap, subscribe: DashMap, @@ -434,6 +435,11 @@ impl OpManager { check_id_op!(id.transaction_type(), TransactionType::Connect); self.ops.connect.insert(id, *op); } + OpEnum::ConnectV2(op) => { + #[cfg(debug_assertions)] + check_id_op!(id.transaction_type(), TransactionType::Connect); + self.ops.connect_v2.insert(id, *op); + } OpEnum::Put(op) => { #[cfg(debug_assertions)] check_id_op!(id.transaction_type(), TransactionType::Put); @@ -472,10 +478,17 @@ impl OpManager { let op = match id.transaction_type() { TransactionType::Connect => self .ops - .connect + .connect_v2 .remove(id) .map(|(_k, v)| v) - .map(|op| OpEnum::Connect(Box::new(op))), + .map(|op| OpEnum::ConnectV2(Box::new(op))) + .or_else(|| { + self.ops + .connect + .remove(id) + .map(|(_k, v)| v) + .map(|op| OpEnum::Connect(Box::new(op))) + }), TransactionType::Put => self.ops.put.remove(id).map(|(_k, v)| v).map(OpEnum::Put), TransactionType::Get => self.ops.get.remove(id).map(|(_k, v)| v).map(OpEnum::Get), TransactionType::Subscribe => self diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index 6a6e9311b..ecb0966da 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -11,12 +11,16 @@ use std::net::SocketAddr; use std::time::Instant; use serde::{Deserialize, Serialize}; +use tokio::sync::mpsc; +use crate::client_events::HostResult; use crate::dev_tool::Location; -use crate::message::{InnerMessage, Transaction}; -use crate::node::{OpManager, PeerId}; +use crate::message::{InnerMessage, NetMessage, NetMessageV1, NodeEvent, Transaction}; +use crate::node::{IsOperationCompleted, NetworkBridge, OpManager, PeerId}; +use crate::operations::{OpEnum, OpError, OpInitialization, OpOutcome, Operation, OperationResult}; use crate::ring::PeerKeyLocation; use crate::util::{Backoff, Contains}; +use freenet_stdlib::client_api::HostResponse; /// Top-level message envelope used by the new connect handshake. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -92,6 +96,15 @@ impl fmt::Display for ConnectMsgV2 { } } +impl ConnectMsgV2 { + pub fn sender(&self) -> Option<&PeerId> { + match self { + ConnectMsgV2::Response { sender, .. } => Some(&sender.peer), + _ => None, + } + } +} + /// Two-message request payload. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) struct ConnectRequest { @@ -115,7 +128,7 @@ pub(crate) struct ConnectResponse { } /// New minimal state machine the joiner tracks. -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) enum ConnectState { /// Joiner waiting for acceptances. WaitingForResponses(JoinerState), @@ -125,7 +138,7 @@ pub(crate) enum ConnectState { Completed, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct JoinerState { pub desired_location: Location, pub target_connections: usize, @@ -134,7 +147,7 @@ pub(crate) struct JoinerState { pub last_progress: Instant, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct RelayState { pub upstream: PeerKeyLocation, pub request: ConnectRequest, @@ -313,7 +326,7 @@ impl JoinerState { /// Placeholder operation wrapper so we can exercise the logic in isolation in /// forthcoming commits. For now this simply captures the shared state we will /// migrate to. -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct ConnectOpV2 { pub(crate) id: Transaction, pub(crate) state: Option, @@ -371,6 +384,30 @@ impl ConnectOpV2 { matches!(self.state, Some(ConnectState::Completed)) } + pub(crate) fn id(&self) -> &Transaction { + &self.id + } + + pub(crate) fn outcome(&self) -> OpOutcome<'_> { + OpOutcome::Irrelevant + } + + pub(crate) fn finalized(&self) -> bool { + self.is_completed() + } + + pub(crate) fn to_host_result(&self) -> HostResult { + Ok(HostResponse::Ok) + } + + pub(crate) fn has_backoff(&self) -> bool { + self.backoff.is_some() + } + + pub(crate) fn gateway(&self) -> Option<&PeerKeyLocation> { + self.gateway.as_deref() + } + pub(crate) fn initiate_join_request( own: PeerKeyLocation, target: PeerKeyLocation, @@ -460,6 +497,160 @@ impl ConnectOpV2 { } } +impl IsOperationCompleted for ConnectOpV2 { + fn is_completed(&self) -> bool { + self.is_completed() + } +} + +impl Operation for ConnectOpV2 { + type Message = ConnectMsgV2; + type Result = (); + + fn id(&self) -> &Transaction { + &self.id + } + + async fn load_or_init<'a>( + op_manager: &'a OpManager, + msg: &'a Self::Message, + ) -> Result, OpError> { + let tx = *msg.id(); + match op_manager.pop(msg.id()) { + Ok(Some(OpEnum::ConnectV2(op))) => Ok(OpInitialization { + op: *op, + sender: msg.sender().cloned(), + }), + Ok(Some(other)) => { + op_manager.push(tx, other).await?; + Err(OpError::OpNotPresent(tx)) + } + Ok(None) => { + let op = match msg { + ConnectMsgV2::Request { from, payload, .. } => { + ConnectOpV2::new_relay(tx, from.clone(), payload.clone()) + } + _ => { + tracing::debug!(%tx, "connect_v2 received message without existing state"); + return Err(OpError::OpNotPresent(tx)); + } + }; + Ok(OpInitialization { op, sender: None }) + } + Err(err) => Err(err.into()), + } + } + + fn process_message<'a, NB: NetworkBridge>( + mut self, + network_bridge: &'a mut NB, + op_manager: &'a OpManager, + msg: &'a Self::Message, + ) -> std::pin::Pin< + Box> + Send + 'a>, + > { + Box::pin(async move { + match msg { + ConnectMsgV2::Request { from, payload, .. } => { + let env = RelayEnv::new(op_manager); + let actions = + self.handle_request(&env, from.clone(), payload.clone(), from.peer.addr); + + if let Some((target, address)) = actions.observed_address { + let msg = ConnectMsgV2::ObservedAddress { + id: self.id, + target: target.clone(), + address, + }; + network_bridge + .send(&target.peer, NetMessage::V1(NetMessageV1::ConnectV2(msg))) + .await?; + } + + if let Some(peer) = actions.expect_connection_from { + op_manager + .notify_node_event(NodeEvent::ExpectPeerConnection { + peer: peer.peer.clone(), + }) + .await?; + } + + if let Some((next, request)) = actions.forward { + let forward_msg = ConnectMsgV2::Request { + id: self.id, + from: env.self_location().clone(), + target: next.clone(), + payload: request, + }; + network_bridge + .send( + &next.peer, + NetMessage::V1(NetMessageV1::ConnectV2(forward_msg)), + ) + .await?; + } + + if let Some(response) = actions.accept_response { + let response_msg = ConnectMsgV2::Response { + id: self.id, + sender: env.self_location().clone(), + target: from.clone(), + payload: response, + }; + return Ok(store_operation_state_with_msg( + &mut self, + Some(response_msg), + )); + } + + Ok(store_operation_state(&mut self)) + } + ConnectMsgV2::Response { payload, .. } => { + if let Some(acceptance) = self.handle_response(payload, Instant::now()) { + if let Some(new_acceptor) = acceptance.new_acceptor { + op_manager + .notify_node_event( + crate::message::NodeEvent::ExpectPeerConnection { + peer: new_acceptor.peer.peer.clone(), + }, + ) + .await?; + + let (callback, mut rx) = mpsc::channel(1); + op_manager + .notify_node_event(NodeEvent::ConnectPeer { + peer: new_acceptor.peer.peer.clone(), + tx: self.id, + callback, + is_gw: new_acceptor.courtesy, + }) + .await?; + + if let Some(result) = rx.recv().await { + if let Ok((peer_id, _remaining)) = result { + tracing::info!(%peer_id, tx=%self.id, "connect_v2 joined peer"); + } else { + tracing::warn!(tx=%self.id, "connect_v2 ConnectPeer failed"); + } + } + } + + if acceptance.satisfied { + self.state = Some(ConnectState::Completed); + } + } + + Ok(store_operation_state(&mut self)) + } + ConnectMsgV2::ObservedAddress { address, .. } => { + self.handle_observed_address(*address, Instant::now()); + Ok(store_operation_state(&mut self)) + } + } + }) + } +} + struct VisitedPeerIds<'a> { peers: &'a [PeerKeyLocation], } @@ -483,6 +674,28 @@ fn push_unique_peer(list: &mut Vec, peer: PeerKeyLocation) { } } +fn store_operation_state(op: &mut ConnectOpV2) -> OperationResult { + store_operation_state_with_msg(op, None) +} + +fn store_operation_state_with_msg( + op: &mut ConnectOpV2, + msg: Option, +) -> OperationResult { + let state_clone = op.state.clone(); + OperationResult { + return_msg: msg.map(|m| NetMessage::V1(NetMessageV1::ConnectV2(m))), + state: state_clone.map(|state| { + OpEnum::ConnectV2(Box::new(ConnectOpV2 { + id: op.id, + state: Some(state), + gateway: op.gateway.clone(), + backoff: op.backoff.clone(), + })) + }), + } +} + #[cfg(test)] mod tests { use super::*; @@ -643,16 +856,16 @@ mod tests { let desired = Location::random(); let ttl = 5; let own = make_peer(7300); - let (_tx, op, msg) = ConnectOpV2::initiate_join_request( - own.clone(), - target.clone(), - desired, - ttl, - 2, - ); + let (_tx, op, msg) = + ConnectOpV2::initiate_join_request(own.clone(), target.clone(), desired, ttl, 2); match msg { - ConnectMsgV2::Request { from, target: msg_target, payload, .. } => { + ConnectMsgV2::Request { + from, + target: msg_target, + payload, + .. + } => { assert_eq!(msg_target.peer, target.peer); assert_eq!(payload.desired_location, desired); assert_eq!(payload.ttl, ttl); @@ -662,6 +875,9 @@ mod tests { other => panic!("unexpected message: {other:?}"), } - assert!(matches!(op.state, Some(ConnectState::WaitingForResponses(_)))); + assert!(matches!( + op.state, + Some(ConnectState::WaitingForResponses(_)) + )); } } diff --git a/crates/core/src/operations/mod.rs b/crates/core/src/operations/mod.rs index 70b1af3bd..b2bf7e70f 100644 --- a/crates/core/src/operations/mod.rs +++ b/crates/core/src/operations/mod.rs @@ -202,6 +202,7 @@ where pub(crate) enum OpEnum { Connect(Box), + ConnectV2(Box), Put(put::PutOp), Get(get::GetOp), Subscribe(subscribe::SubscribeOp), @@ -212,6 +213,7 @@ impl OpEnum { delegate::delegate! { to match self { OpEnum::Connect(op) => op, + OpEnum::ConnectV2(op) => op, OpEnum::Put(op) => op, OpEnum::Get(op) => op, OpEnum::Subscribe(op) => op, diff --git a/crates/core/src/util/mod.rs b/crates/core/src/util/mod.rs index 68ce10da6..72959528a 100644 --- a/crates/core/src/util/mod.rs +++ b/crates/core/src/util/mod.rs @@ -68,7 +68,7 @@ pub fn set_cleanup_on_exit(config: Arc) -> Result<(), ctrlc::Error> }) } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Backoff { attempt: usize, max_attempts: usize, @@ -77,7 +77,7 @@ pub struct Backoff { strategy: BackoffStrategy, } -#[derive(Debug)] +#[derive(Debug, Clone)] enum BackoffStrategy { Exponential, Logarithmic { interval_reduction_factor: f64 }, From 279a3f8f3307deeb7353905540e04aeea97d63a1 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 02:21:29 +0100 Subject: [PATCH 20/26] feat(connect): route connect_v2 messages through op manager --- crates/core/src/node/mod.rs | 60 +++++++++++++++++++ .../src/node/network_bridge/p2p_protoc.rs | 9 ++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/crates/core/src/node/mod.rs b/crates/core/src/node/mod.rs index bd73ed845..4b3c2b433 100644 --- a/crates/core/src/node/mod.rs +++ b/crates/core/src/node/mod.rs @@ -42,6 +42,7 @@ use crate::{ message::{InnerMessage, NetMessage, Transaction, TransactionType}, operations::{ connect::{self, ConnectOp}, + connect_v2::ConnectOpV2, get, put, subscribe, update, OpEnum, OpError, OpOutcome, }, ring::{Location, PeerKeyLocation}, @@ -715,6 +716,29 @@ async fn process_message_v1( ) .await; } + NetMessageV1::ConnectV2(ref op) => { + let parent_span = tracing::Span::current(); + let span = tracing::info_span!( + parent: parent_span, + "handle_connect_v2_op_request", + transaction = %msg.id(), + tx_type = %msg.id().transaction_type() + ); + let op_result = + handle_op_request::(&op_manager, &mut conn_manager, op) + .instrument(span) + .await; + + handle_op_not_available!(op_result); + return report_result( + tx, + op_result, + &op_manager, + executor_callback, + &mut *event_listener, + ) + .await; + } NetMessageV1::Put(ref op) => { let op_result = handle_op_request::(&op_manager, &mut conn_manager, op).await; @@ -889,6 +913,42 @@ where ) .await; } + NetMessageV1::ConnectV2(ref op) => { + let parent_span = tracing::Span::current(); + let span = tracing::info_span!( + parent: parent_span, + "handle_connect_v2_op_request", + transaction = %msg.id(), + tx_type = %msg.id().transaction_type() + ); + let op_result = + handle_op_request::(&op_manager, &mut conn_manager, op) + .instrument(span) + .await; + + if let Err(OpError::OpNotAvailable(state)) = &op_result { + match state { + OpNotAvailable::Running => { + tracing::debug!("Pure network: Operation still running"); + tokio::time::sleep(Duration::from_micros(1_000)).await; + continue; + } + OpNotAvailable::Completed => { + tracing::debug!("Pure network: Operation already completed"); + return Ok(None); + } + } + } + + return handle_pure_network_result( + tx, + op_result, + &op_manager, + executor_callback, + &mut *event_listener, + ) + .await; + } NetMessageV1::Put(ref op) => { tracing::debug!( tx = %op.id(), diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index e9b895be0..c55045f3e 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -29,7 +29,9 @@ use crate::node::network_bridge::handshake::{ use crate::node::network_bridge::priority_select; use crate::node::subscribe::SubscribeMsg; use crate::node::{MessageProcessor, PeerId}; -use crate::operations::{connect::ConnectMsg, get::GetMsg, put::PutMsg, update::UpdateMsg}; +use crate::operations::{ + connect::ConnectMsg, connect_v2::ConnectMsgV2, get::GetMsg, put::PutMsg, update::UpdateMsg, +}; use crate::ring::Location; use crate::transport::{ create_connection_handler, OutboundConnectionHandler, PeerConnection, TransportError, @@ -2044,6 +2046,11 @@ fn extract_sender_from_message(msg: &NetMessage) -> Option { ConnectMsg::Request { target, .. } => Some(target.clone()), _ => None, }, + NetMessageV1::ConnectV2(connect_msg) => match connect_msg { + ConnectMsgV2::Response { sender, .. } => Some(sender.clone()), + ConnectMsgV2::Request { from, .. } => Some(from.clone()), + ConnectMsgV2::ObservedAddress { target, .. } => Some(target.clone()), + }, // Get messages have sender in some variants NetMessageV1::Get(get_msg) => match get_msg { GetMsg::SeekNode { sender, .. } => Some(sender.clone()), From b242d5a52aa8551a1a61285cede686f80b409237 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 04:10:23 +0100 Subject: [PATCH 21/26] fix(connect): use connect_v2 for maintenance and join --- crates/core/src/node/p2p_impl.rs | 1 + .../core/src/node/testing_impl/in_memory.rs | 1 + crates/core/src/operations/connect.rs | 228 ++++-------------- crates/core/src/ring/connection_manager.rs | 1 + crates/core/src/ring/mod.rs | 126 +++++----- 5 files changed, 113 insertions(+), 244 deletions(-) diff --git a/crates/core/src/node/p2p_impl.rs b/crates/core/src/node/p2p_impl.rs index fa50eb732..a062ef2fa 100644 --- a/crates/core/src/node/p2p_impl.rs +++ b/crates/core/src/node/p2p_impl.rs @@ -259,6 +259,7 @@ impl NodeP2P { connection_manager, result_router_tx, )?); + op_manager.ring.attach_op_manager(&op_manager); let (executor_listener, executor_sender) = contract::executor_channel(op_manager.clone()); let contract_handler = CH::build(ch_inbound, executor_sender, ch_builder) .await diff --git a/crates/core/src/node/testing_impl/in_memory.rs b/crates/core/src/node/testing_impl/in_memory.rs index 785db58a2..adde6de93 100644 --- a/crates/core/src/node/testing_impl/in_memory.rs +++ b/crates/core/src/node/testing_impl/in_memory.rs @@ -46,6 +46,7 @@ impl Builder { connection_manager.clone(), result_router_tx, )?); + op_manager.ring.attach_op_manager(&op_manager); std::mem::drop(_guard); let (executor_listener, executor_sender) = executor_channel(op_manager.clone()); let contract_handler = diff --git a/crates/core/src/operations/connect.rs b/crates/core/src/operations/connect.rs index 993be2a0d..237904c07 100644 --- a/crates/core/src/operations/connect.rs +++ b/crates/core/src/operations/connect.rs @@ -9,7 +9,11 @@ use freenet_stdlib::client_api::HostResponse; use futures::{Future, StreamExt}; pub(crate) use self::messages::{ConnectMsg, ConnectRequest, ConnectResponse}; -use super::{connect, OpError, OpInitialization, OpOutcome, Operation, OperationResult}; +use super::{ + connect, + connect_v2::{ConnectMsgV2, ConnectOpV2}, + OpError, OpInitialization, OpOutcome, Operation, OperationResult, +}; use crate::client_events::HostResult; use crate::dev_tool::Location; use crate::message::{NetMessageV1, NodeEvent}; @@ -773,6 +777,7 @@ type Requester = PeerKeyLocation; #[derive(Debug)] pub enum ConnectState { Initializing, + #[allow(dead_code)] ConnectingToNode(ConnectionInfo), AwaitingConnectivity(ConnectivityInfo), AwaitingConnectionAcquisition, @@ -826,6 +831,7 @@ pub(crate) struct NewConnectionInfo { } impl ConnectState { + #[allow(dead_code)] fn try_unwrap_connecting(self) -> Result { if let Self::ConnectingToNode(conn_info) = self { Ok(conn_info) @@ -958,199 +964,67 @@ pub(crate) async fn join_ring_request( op_manager: &OpManager, ) -> Result<(), OpError> { use crate::node::ConnectionError; - if !op_manager.ring.connection_manager.should_accept( - gateway.location.ok_or_else(|| { - tracing::error!( - "Gateway location not found, this should not be possible, report an error" - ); - OpError::ConnError(ConnectionError::LocationUnknown) - })?, - &gateway.peer, - ) { - // ensure that we still want to connect AND reserve an spot implicitly + let location = gateway.location.ok_or_else(|| { + tracing::error!("Gateway location not found, this should not be possible, report an error"); + OpError::ConnError(ConnectionError::LocationUnknown) + })?; + + if !op_manager + .ring + .connection_manager + .should_accept(location, &gateway.peer) + { return Err(OpError::ConnError(ConnectionError::UnwantedConnection)); } - let tx_id = Transaction::new::(); - tracing::info!(%gateway.peer, "Attempting network join"); - let mut op = initial_request(gateway.clone(), op_manager.ring.max_hops_to_live, tx_id); - if let Some(mut backoff) = backoff { - // backoff to retry later in case it failed - tracing::warn!("Performing a new join, attempt {}", backoff.retries() + 1); - if backoff.sleep().await.is_none() { + let mut backoff = backoff; + if let Some(backoff_state) = backoff.as_mut() { + tracing::warn!( + "Performing a new join, attempt {}", + backoff_state.retries() + 1 + ); + if backoff_state.sleep().await.is_none() { tracing::error!("Max number of retries reached"); if op_manager.ring.open_connections() == 0 { - // only consider this a complete failure if no connections were established at all - // if connections where established the peer should incrementally acquire more over time - return Err(OpError::MaxRetriesExceeded(tx_id, tx_id.transaction_type())); + let tx = Transaction::new::(); + return Err(OpError::MaxRetriesExceeded(tx, tx.transaction_type())); } else { return Ok(()); } } - // on first run the backoff will be initialized at the `initial_request` function - // if the op was to fail and retried this function will be called with the previous backoff - // passed as an argument and advanced - op.backoff = Some(backoff); } - connect_request(tx_id, op_manager, op).await?; - Ok(()) -} -fn initial_request( - gateway: PeerKeyLocation, - max_hops_to_live: usize, - id: Transaction, -) -> ConnectOp { - const MAX_JOIN_RETRIES: usize = usize::MAX; - let state = ConnectState::ConnectingToNode(ConnectionInfo { - gateway: gateway.clone(), - accepted_by: HashSet::new(), - remaining_connections: max_hops_to_live, - }); - let ceiling = if cfg!(test) { - Duration::from_secs(1) - } else { - Duration::from_secs(120) - }; - ConnectOp { - id, - state: Some(state), - gateway: Some(Box::new(gateway)), - backoff: Some(Backoff::new( - Duration::from_secs(1), - ceiling, - MAX_JOIN_RETRIES, - )), + let own = op_manager.ring.connection_manager.own_location(); + let ttl = op_manager + .ring + .max_hops_to_live + .max(1) + .min(u8::MAX as usize) as u8; + let target_connections = op_manager.ring.connection_manager.min_connections; + + let (tx, mut op, msg) = ConnectOpV2::initiate_join_request( + own.clone(), + gateway.clone(), + location, + ttl, + target_connections, + ); + + op.gateway = Some(Box::new(gateway.clone())); + if let Some(backoff) = backoff { + op.backoff = Some(backoff); } -} -/// Join ring routine, called upon performing a join operation for this node. -async fn connect_request( - tx: Transaction, - op_manager: &OpManager, - join_op: ConnectOp, -) -> Result<(), OpError> { - let ConnectOp { - id, state, backoff, .. - } = join_op; - let ConnectionInfo { gateway, .. } = state.expect("infallible").try_unwrap_connecting()?; + tracing::info!(%gateway.peer, tx = %tx, "Attempting network join using connect_v2"); - tracing::info!( - tx = %id, - gateway = %gateway, - "Connecting to gateway", - ); - - let (callback, mut result) = tokio::sync::mpsc::channel(10); op_manager - .notify_node_event(NodeEvent::ConnectPeer { - peer: gateway.peer.clone(), - tx, - callback, - is_gw: true, - }) + .notify_op_change( + NetMessage::V1(NetMessageV1::ConnectV2(msg)), + OpEnum::ConnectV2(Box::new(op)), + ) .await?; - match result.recv().await.ok_or(OpError::NotificationError)? { - Ok((joiner, remaining_checks)) => { - op_manager - .ring - .add_connection( - gateway.location.expect("location not found"), - gateway.peer.clone(), - true, - ) - .await; - let Some(remaining_connections) = remaining_checks else { - tracing::error!(tx = %id, "Failed to connect to gateway, missing remaining checks"); - return Err(OpError::ConnError( - crate::node::ConnectionError::FailedConnectOp, - )); - }; - tracing::debug!( - tx = %id, - gateway = %gateway, - joiner = %joiner, - "Sending connection request to gateway", - ); - - // Update state to indicate we're waiting for new connections - op_manager - .push( - tx, - OpEnum::Connect(Box::new(ConnectOp { - id, - state: Some(ConnectState::AwaitingNewConnection(NewConnectionInfo { - remaining_connections, - })), - gateway: Some(Box::new(gateway.clone())), - backoff, - })), - ) - .await?; - - // After connecting to gateway, immediately request to find more peers - // We'll create a new transaction for this follow-up request - let new_tx_id = Transaction::new::(); - let ideal_location = Location::random(); - let joiner_location = op_manager.ring.connection_manager.own_location(); - - // Track this transaction so connection maintenance knows about it - op_manager - .ring - .live_tx_tracker - .add_transaction(gateway.peer.clone(), new_tx_id); - - let msg = ConnectMsg::Request { - id: new_tx_id, - target: gateway.clone(), - msg: ConnectRequest::FindOptimalPeer { - query_target: gateway.clone(), - ideal_location, - joiner: joiner_location, - max_hops_to_live: op_manager.ring.max_hops_to_live, - skip_connections: HashSet::from([joiner.clone()]), - skip_forwards: HashSet::new(), - }, - }; - tracing::info!( - tx = %new_tx_id, - gateway = %gateway.peer, - ideal_location = %ideal_location, - "Immediately requesting more peer connections from gateway" - ); - - // Send the message through the op_manager's notification system - // We need to create a new ConnectOp for this new transaction - let new_op = ConnectOp::new( - new_tx_id, - Some(ConnectState::AwaitingNewConnection(NewConnectionInfo { - remaining_connections: op_manager.ring.max_hops_to_live, - })), - Some(Box::new(gateway.clone())), - None, - ); - - // Push the new operation - op_manager - .push(new_tx_id, OpEnum::Connect(Box::new(new_op))) - .await?; - - // Send the FindOptimalPeer message to the gateway over the network - // We use notify_node_event with a SendMessage event to ensure it goes through - // the proper network channel, not just local processing - op_manager - .notify_node_event(NodeEvent::SendMessage { - target: gateway.peer.clone(), - msg: Box::new(NetMessage::from(msg)), - }) - .await?; - Ok(()) - } - Err(_) => Err(OpError::ConnError( - crate::node::ConnectionError::FailedConnectOp, - )), - } + Ok(()) } pub(crate) struct ForwardParams { diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index bd9d6a7b9..1e1e8858a 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -472,6 +472,7 @@ impl ConnectionManager { total } + #[allow(dead_code)] pub(super) fn connected_peers(&self) -> impl Iterator { let read = self.location_for_peer.read(); read.keys().cloned().collect::>().into_iter() diff --git a/crates/core/src/ring/mod.rs b/crates/core/src/ring/mod.rs index ec07cab30..3a8aae2ed 100644 --- a/crates/core/src/ring/mod.rs +++ b/crates/core/src/ring/mod.rs @@ -10,7 +10,7 @@ use std::{ collections::BTreeMap, sync::{ atomic::{AtomicU64, AtomicUsize}, - Arc, + Arc, Weak, }, time::{Duration, Instant}, }; @@ -33,9 +33,9 @@ use crate::transport::TransportPublicKey; use crate::util::Contains; use crate::{ config::GlobalExecutor, - message::Transaction, - node::{self, EventLoopNotificationsSender, NodeConfig, PeerId}, - operations::connect, + message::{NetMessage, NetMessageV1, Transaction}, + node::{self, EventLoopNotificationsSender, NodeConfig, OpManager, PeerId}, + operations::{connect_v2::ConnectOpV2, OpEnum}, router::Router, }; @@ -68,6 +68,7 @@ pub(crate) struct Ring { pub live_tx_tracker: LiveTransactionTracker, seeding_manager: seeding::SeedingManager, event_register: Box, + op_manager: RwLock>>, /// Whether this peer is a gateway or not. This will affect behavior of the node when acquiring /// and dropping connections. pub(crate) is_gateway: bool, @@ -122,6 +123,7 @@ impl Ring { seeding_manager: seeding::SeedingManager::new(), live_tx_tracker: live_tx_tracker.clone(), event_register: Box::new(event_register), + op_manager: RwLock::new(None), is_gateway, }; @@ -145,10 +147,20 @@ impl Ring { .connection_maintenance(event_loop_notifier, live_tx_tracker, missing_candidate_rx) .instrument(span), ); - Ok(ring) } + pub fn attach_op_manager(&self, op_manager: &Arc) { + self.op_manager.write().replace(Arc::downgrade(op_manager)); + } + + fn upgrade_op_manager(&self) -> Option> { + self.op_manager + .read() + .as_ref() + .and_then(|weak| weak.clone().upgrade()) + } + pub fn is_gateway(&self) -> bool { self.is_gateway } @@ -435,6 +447,13 @@ impl Ring { let mut pending_conn_adds = BTreeSet::new(); let mut this_peer = None; loop { + let op_manager = match self.upgrade_op_manager() { + Some(op_manager) => op_manager, + None => { + tokio::time::sleep(Duration::from_millis(100)).await; + continue; + } + }; let Some(this_peer) = &this_peer else { let Some(peer) = self.connection_manager.get_peer_key() else { tokio::time::sleep(Duration::from_secs(1)).await; @@ -481,7 +500,13 @@ impl Ring { ideal_location ); live_tx = self - .acquire_new(ideal_location, &skip_list, ¬ifier, &live_tx_tracker) + .acquire_new( + ideal_location, + &skip_list, + ¬ifier, + &live_tx_tracker, + &op_manager, + ) .await .map_err(|error| { tracing::error!( @@ -611,13 +636,14 @@ impl Ring { } } - #[tracing::instrument(level = "debug", skip(self, notifier, live_tx_tracker), fields(peer = %self.connection_manager.pub_key))] + #[tracing::instrument(level = "debug", skip(self, notifier, live_tx_tracker, op_manager), fields(peer = %self.connection_manager.pub_key))] async fn acquire_new( &self, ideal_location: Location, skip_list: &HashSet<&PeerId>, notifier: &EventLoopNotificationsSender, live_tx_tracker: &LiveTransactionTracker, + op_manager: &Arc, ) -> anyhow::Result> { let current_connections = self.connection_manager.get_open_connections(); let is_gateway = self.is_gateway; @@ -628,29 +654,6 @@ impl Ring { "acquire_new: attempting to find peer to query" ); - // CRITICAL: Use separate skip lists for routing vs. connection requests - // - // The routing skip list determines who we can ASK for peer recommendations. - // The connection skip list determines who we DON'T want to connect to. - // - // For peers with few connections (e.g., only gateway), we MUST be able to - // route through existing connections to discover new peers. If we filter out - // existing connections from routing, peers get stuck unable to find anyone to ask. - // - // Example scenario: - // - Peer has 1 connection (gateway) - // - Topology manager suggests random location for diversity - // - Old code: adds gateway to routing skip list → routing() returns None → no request sent - // - New code: routes through gateway → gateway helps discover other peers → mesh forms - // - // The skip list for routing should only exclude: - // - This peer itself - // - Peers we've already tried and failed with (missing candidates) - // - // The skip list for the FindOptimalPeer request should also exclude: - // - Already connected peers (to avoid reconnecting) - - // Find a peer to query (allow routing through existing connections) let query_target = { let router = self.router.read(); let num_connections = self.connection_manager.num_connections(); @@ -660,62 +663,51 @@ impl Ring { skip_list_size = skip_list.len(), "Looking for peer to route through" ); - if let Some(t) = self.connection_manager.routing( - ideal_location, - None, - skip_list, // Use just the input skip list (missing candidates + self) - &router, - ) { - tracing::debug!(query_target = %t, "Found routing target"); - t + if let Some(target) = + self.connection_manager + .routing(ideal_location, None, skip_list, &router) + { + tracing::debug!(query_target = %target, "Found routing target"); + target } else { tracing::warn!( "acquire_new: routing() returned None - cannot find peer to query (connections: {}, is_gateway: {})", current_connections, is_gateway ); - return Ok(None); } }; - // Create skip list for the FindOptimalPeer request (includes already connected peers) - let connection_skip_list: HashSet = skip_list - .iter() - .copied() - .cloned() - .chain(self.connection_manager.connected_peers()) - .collect(); - let joiner = self.connection_manager.own_location(); tracing::info!( this_peer = %joiner, query_target_peer = %query_target.peer, %ideal_location, - skip_connections_count = connection_skip_list.len(), - "Sending FindOptimalPeer request via connection_maintenance" + "Sending ConnectV2 request via connection_maintenance" ); - let missing_connections = self.connection_manager.max_connections - self.open_connections(); - let id = Transaction::new::(); - live_tx_tracker.add_transaction(query_target.peer.clone(), id); - let msg = connect::ConnectMsg::Request { - id, - target: query_target.clone(), - msg: connect::ConnectRequest::FindOptimalPeer { - query_target, - ideal_location, - joiner, - max_hops_to_live: missing_connections, - skip_connections: connection_skip_list, - skip_forwards: HashSet::new(), - }, - }; + let ttl = self.max_hops_to_live.max(1).min(u8::MAX as usize) as u8; + let target_connections = self.connection_manager.min_connections; + + let (tx, op, msg) = ConnectOpV2::initiate_join_request( + joiner, + query_target.clone(), + ideal_location, + ttl, + target_connections, + ); + + live_tx_tracker.add_transaction(query_target.peer.clone(), tx); + op_manager + .push(tx, OpEnum::ConnectV2(Box::new(op))) + .await + .map_err(|err| anyhow::anyhow!(err))?; notifier .notifications_sender - .send(Either::Left(msg.into())) + .send(Either::Left(NetMessage::V1(NetMessageV1::ConnectV2(msg)))) .await?; - tracing::info!(tx = %id, "FindOptimalPeer request sent"); - Ok(Some(id)) + tracing::info!(tx = %tx, "ConnectV2 request sent"); + Ok(Some(tx)) } } From b9103295cca3f564b208a968377005a015397fc0 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 04:44:20 +0100 Subject: [PATCH 22/26] fix(connect): trigger maintenance via connect_v2 --- crates/core/src/node/p2p_impl.rs | 44 ++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/crates/core/src/node/p2p_impl.rs b/crates/core/src/node/p2p_impl.rs index a062ef2fa..1d6269cab 100644 --- a/crates/core/src/node/p2p_impl.rs +++ b/crates/core/src/node/p2p_impl.rs @@ -20,9 +20,9 @@ use crate::{ self, ContractHandler, ContractHandlerChannel, ExecutorToEventLoopChannel, NetworkEventListenerHalve, WaitingResolution, }, - message::{NetMessage, NodeEvent, Transaction}, + message::{NetMessage, NetMessageV1, NodeEvent}, node::NodeConfig, - operations::{connect, OpEnum}, + operations::{connect, connect_v2::ConnectOpV2, OpEnum}, }; use super::OpManager; @@ -131,10 +131,7 @@ impl NodeP2P { /// Trigger the connection maintenance task to actively look for more peers async fn trigger_connection_maintenance(&self) -> anyhow::Result<()> { - // Send a connect request to find more peers - use crate::operations::connect; let ideal_location = Location::random(); - let tx = Transaction::new::(); // Find a connected peer to query let query_target = { @@ -149,23 +146,32 @@ impl NodeP2P { if let Some(query_target) = query_target { let joiner = self.op_manager.ring.connection_manager.own_location(); - let msg = connect::ConnectMsg::Request { - id: tx, - target: query_target.clone(), - msg: connect::ConnectRequest::FindOptimalPeer { - query_target, - ideal_location, - joiner, - max_hops_to_live: self.op_manager.ring.max_hops_to_live, - skip_connections: HashSet::new(), - skip_forwards: HashSet::new(), - }, - }; + let ttl = self + .op_manager + .ring + .max_hops_to_live + .max(1) + .min(u8::MAX as usize) as u8; + let target_connections = self.op_manager.ring.connection_manager.min_connections; + + let (tx, op, msg) = ConnectOpV2::initiate_join_request( + joiner, + query_target.clone(), + ideal_location, + ttl, + target_connections, + ); + tracing::debug!( + %tx, + query_peer = %query_target.peer, + %ideal_location, + "Triggering connection maintenance connect_v2 request" + ); self.op_manager .notify_op_change( - NetMessage::from(msg), - OpEnum::Connect(Box::new(connect::ConnectOp::new(tx, None, None, None))), + NetMessage::V1(NetMessageV1::ConnectV2(msg)), + OpEnum::ConnectV2(Box::new(op)), ) .await?; } From 0dddfecd485ec3eb6cb02e6ed152d04894eb0621 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 6 Nov 2025 16:37:25 +0100 Subject: [PATCH 23/26] refactor(connect): checkpoint connect-v2 migration work --- crates/core/src/node/mod.rs | 10 +- crates/core/src/node/network_bridge.rs | 1 + .../src/node/network_bridge/handshake_v2.rs | 126 ++++++++++++ crates/core/src/node/p2p_impl.rs | 12 +- crates/core/src/node/testing_impl.rs | 4 +- crates/core/src/operations/connect.rs | 190 +----------------- crates/core/src/operations/connect_v2.rs | 187 ++++++++++++++++- crates/core/src/router/isotonic_estimator.rs | 9 +- crates/core/src/test_utils.rs | 19 +- .../src/topology/request_density_tracker.rs | 5 +- crates/core/src/tracing/mod.rs | 4 +- .../peer_connection/outbound_stream.rs | 5 +- crates/core/src/wasm_runtime/store.rs | 19 +- .../wasm_runtime/tests/contract_metering.rs | 9 +- crates/core/src/wasm_runtime/tests/mod.rs | 5 +- crates/core/tests/connectivity.rs | 6 +- crates/core/tests/error_notification.rs | 52 ++--- crates/core/tests/isolated_node_regression.rs | 63 +++--- crates/core/tests/operations.rs | 4 +- crates/core/tests/redb_migration.rs | 9 +- 20 files changed, 439 insertions(+), 300 deletions(-) create mode 100644 crates/core/src/node/network_bridge/handshake_v2.rs diff --git a/crates/core/src/node/mod.rs b/crates/core/src/node/mod.rs index 4b3c2b433..669078d3d 100644 --- a/crates/core/src/node/mod.rs +++ b/crates/core/src/node/mod.rs @@ -42,7 +42,7 @@ use crate::{ message::{InnerMessage, NetMessage, Transaction, TransactionType}, operations::{ connect::{self, ConnectOp}, - connect_v2::ConnectOpV2, + connect_v2::{self, ConnectOpV2}, get, put, subscribe, update, OpEnum, OpError, OpOutcome, }, ring::{Location, PeerKeyLocation}, @@ -1224,7 +1224,7 @@ async fn handle_aborted_op( } = *op; if let Some(gateway) = gateway { tracing::warn!("Retry connecting to gateway {}", gateway.peer); - connect::join_ring_request(backoff, &gateway, op_manager).await?; + connect_v2::join_ring_request(backoff, &gateway, op_manager).await?; } } Ok(Some(OpEnum::ConnectV2(op))) @@ -1235,7 +1235,7 @@ async fn handle_aborted_op( let gateway = op.gateway().cloned(); if let Some(gateway) = gateway { tracing::warn!("Retry connecting to gateway {}", gateway.peer); - connect::join_ring_request(None, &gateway, op_manager).await?; + connect_v2::join_ring_request(None, &gateway, op_manager).await?; } } Ok(Some(OpEnum::Connect(_))) => { @@ -1243,7 +1243,7 @@ async fn handle_aborted_op( if op_manager.ring.open_connections() == 0 && op_manager.ring.is_gateway() { tracing::warn!("Retrying joining the ring with an other gateway"); if let Some(gateway) = gateways.iter().shuffle().next() { - connect::join_ring_request(None, gateway, op_manager).await? + connect_v2::join_ring_request(None, gateway, op_manager).await? } } } @@ -1251,7 +1251,7 @@ async fn handle_aborted_op( if op_manager.ring.open_connections() == 0 && op_manager.ring.is_gateway() { tracing::warn!("Retrying joining the ring with an other gateway"); if let Some(gateway) = gateways.iter().shuffle().next() { - connect::join_ring_request(None, gateway, op_manager).await? + connect_v2::join_ring_request(None, gateway, op_manager).await? } } } diff --git a/crates/core/src/node/network_bridge.rs b/crates/core/src/node/network_bridge.rs index df659f637..0caa11d09 100644 --- a/crates/core/src/node/network_bridge.rs +++ b/crates/core/src/node/network_bridge.rs @@ -17,6 +17,7 @@ use super::PeerId; use crate::message::{NetMessage, NodeEvent}; mod handshake; +mod handshake_v2; pub(crate) mod in_memory; pub(crate) mod p2p_protoc; pub(crate) mod priority_select; diff --git a/crates/core/src/node/network_bridge/handshake_v2.rs b/crates/core/src/node/network_bridge/handshake_v2.rs new file mode 100644 index 000000000..6fe9ffac8 --- /dev/null +++ b/crates/core/src/node/network_bridge/handshake_v2.rs @@ -0,0 +1,126 @@ +//! Placeholder implementation for the upcoming ConnectV2 handshake flow. +//! +//! The new handshake pipeline is still under active development. For now we keep a +//! compile-time stub so the surrounding modules that reference this file continue +//! to build while we flesh out the full state machine. + +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use futures::Stream; +use tokio::sync::mpsc; + +use crate::dev_tool::{Location, PeerId, Transaction}; +use crate::ring::ConnectionManager; +use crate::router::Router; +use crate::transport::{InboundConnectionHandler, OutboundConnectionHandler, PeerConnection}; + +/// Events that will eventually be emitted by the ConnectV2 handshake handler. +#[derive(Debug)] +#[allow(dead_code)] +pub(crate) enum Event { + InboundConnection { + transaction: Transaction, + connection: PeerConnection, + joiner: PeerId, + courtesy: bool, + }, + OutboundEstablished { + transaction: Transaction, + peer: PeerId, + connection: PeerConnection, + courtesy: bool, + }, + OutboundFailed { + transaction: Transaction, + peer: PeerId, + courtesy: bool, + }, +} + +/// Commands delivered from the event loop into the handshake handler. +#[derive(Debug)] +#[allow(dead_code)] +pub(crate) enum Command { + Connect { + peer: PeerId, + transaction: Transaction, + courtesy: bool, + }, + DropConnection { + peer: PeerId, + }, +} + +#[allow(dead_code)] +pub(crate) struct CommandSender(mpsc::Sender); + +impl CommandSender { + #[allow(dead_code)] + pub async fn send(&self, cmd: Command) -> Result<(), mpsc::error::SendError> { + self.0.send(cmd).await + } +} + +/// Temporary stub implementation that just keeps channels alive. +#[allow(dead_code)] +pub(crate) struct HandshakeHandler { + #[allow(dead_code)] + inbound: InboundConnectionHandler, + #[allow(dead_code)] + outbound: OutboundConnectionHandler, + #[allow(dead_code)] + connection_manager: ConnectionManager, + #[allow(dead_code)] + router: Arc, + #[allow(dead_code)] + this_location: Option, + #[allow(dead_code)] + is_gateway: bool, + #[allow(dead_code)] + peer_ready: Option>, + commands_rx: mpsc::Receiver, +} + +#[allow(clippy::too_many_arguments)] +#[allow(dead_code)] +impl HandshakeHandler { + #[allow(clippy::too_many_arguments)] + pub fn new( + inbound: InboundConnectionHandler, + outbound: OutboundConnectionHandler, + connection_manager: ConnectionManager, + router: Arc, + this_location: Option, + is_gateway: bool, + peer_ready: Option>, + ) -> (Self, CommandSender) { + let (tx, rx) = mpsc::channel(1); + ( + HandshakeHandler { + inbound, + outbound, + connection_manager, + router, + this_location, + is_gateway, + peer_ready, + commands_rx: rx, + }, + CommandSender(tx), + ) + } +} + +impl Stream for HandshakeHandler { + type Item = Event; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match Pin::new(&mut self.commands_rx).poll_recv(cx) { + Poll::Ready(Some(_cmd)) => Poll::Pending, + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } +} diff --git a/crates/core/src/node/p2p_impl.rs b/crates/core/src/node/p2p_impl.rs index 1d6269cab..6fab50975 100644 --- a/crates/core/src/node/p2p_impl.rs +++ b/crates/core/src/node/p2p_impl.rs @@ -22,7 +22,10 @@ use crate::{ }, message::{NetMessage, NetMessageV1, NodeEvent}, node::NodeConfig, - operations::{connect, connect_v2::ConnectOpV2, OpEnum}, + operations::{ + connect_v2::{self, ConnectOpV2}, + OpEnum, + }, }; use super::OpManager; @@ -180,8 +183,11 @@ impl NodeP2P { } pub(super) async fn run_node(self) -> anyhow::Result { if self.should_try_connect { - connect::initial_join_procedure(self.op_manager.clone(), &self.conn_manager.gateways) - .await?; + connect_v2::initial_join_procedure( + self.op_manager.clone(), + &self.conn_manager.gateways, + ) + .await?; // After connecting to gateways, aggressively try to reach min_connections // This is important for fast startup and avoiding on-demand connection delays diff --git a/crates/core/src/node/testing_impl.rs b/crates/core/src/node/testing_impl.rs index 562285c7b..6987d302e 100644 --- a/crates/core/src/node/testing_impl.rs +++ b/crates/core/src/node/testing_impl.rs @@ -28,7 +28,7 @@ use crate::{ dev_tool::TransportKeypair, message::{MessageStats, NetMessage, NetMessageV1, NodeEvent, Transaction}, node::{InitPeerNode, NetEventRegister, NodeConfig}, - operations::connect, + operations::connect_v2, ring::{Distance, Location, PeerKeyLocation}, tracing::TestEventListener, transport::TransportPublicKey, @@ -780,7 +780,7 @@ where NB: NetworkBridge + NetworkBridgeExt, UsrEv: ClientEventsProxy + Send + 'static, { - connect::initial_join_procedure(config.op_manager.clone(), &config.gateways).await?; + connect_v2::initial_join_procedure(config.op_manager.clone(), &config.gateways).await?; let (client_responses, _cli_response_sender) = contract::client_responses_channel(); let span = { config diff --git a/crates/core/src/operations/connect.rs b/crates/core/src/operations/connect.rs index 237904c07..0c36d2202 100644 --- a/crates/core/src/operations/connect.rs +++ b/crates/core/src/operations/connect.rs @@ -3,17 +3,12 @@ use std::borrow::Borrow; use std::collections::HashSet; use std::pin::Pin; use std::sync::Arc; -use std::time::Duration; use freenet_stdlib::client_api::HostResponse; -use futures::{Future, StreamExt}; +use futures::Future; pub(crate) use self::messages::{ConnectMsg, ConnectRequest, ConnectResponse}; -use super::{ - connect, - connect_v2::{ConnectMsgV2, ConnectOpV2}, - OpError, OpInitialization, OpOutcome, Operation, OperationResult, -}; +use super::{connect, OpError, OpInitialization, OpOutcome, Operation, OperationResult}; use crate::client_events::HostResult; use crate::dev_tool::Location; use crate::message::{NetMessageV1, NodeEvent}; @@ -846,187 +841,6 @@ impl ConnectState { /// - gateways: Inmutable list of known gateways. Passed when starting up the node. /// After the initial connections through the gateways are established all other connections /// (to gateways or regular peers) will be treated as regular connections. -pub(crate) async fn initial_join_procedure( - op_manager: Arc, - gateways: &[PeerKeyLocation], -) -> Result<(), OpError> { - use crate::util::IterExt; - let number_of_parallel_connections = { - let max_potential_conns_per_gw = op_manager.ring.max_hops_to_live; - // e.g. 10 gateways and htl 5 -> only need 2 connections in parallel - let needed_to_cover_max = - op_manager.ring.connection_manager.max_connections / max_potential_conns_per_gw; - // if we have 2 gws, we will at least attempt 2 parallel connections - gateways.iter().take(needed_to_cover_max).count().max(2) - }; - let gateways = gateways.to_vec(); - tokio::task::spawn(async move { - if gateways.is_empty() { - tracing::warn!("No gateways available, aborting join procedure"); - return; - } - - const WAIT_TIME: u64 = 1; - const LONG_WAIT_TIME: u64 = 30; - const BOOTSTRAP_THRESHOLD: usize = 4; - - tracing::info!( - "Starting initial join procedure with {} gateways", - gateways.len() - ); - - loop { - let open_conns = op_manager.ring.open_connections(); - let unconnected_gateways: Vec<_> = - op_manager.ring.is_not_connected(gateways.iter()).collect(); - - tracing::debug!( - "Connection status: open_connections = {}, unconnected_gateways = {}", - open_conns, - unconnected_gateways.len() - ); - - // Only try to connect to gateways if we have fewer than BOOTSTRAP_THRESHOLD connections - // This prevents overloading gateways once peers have basic connectivity - let unconnected_count = unconnected_gateways.len(); - - if open_conns < BOOTSTRAP_THRESHOLD && unconnected_count > 0 { - tracing::info!( - "Below bootstrap threshold ({} < {}), attempting to connect to {} gateways", - open_conns, - BOOTSTRAP_THRESHOLD, - number_of_parallel_connections.min(unconnected_count) - ); - let select_all = futures::stream::FuturesUnordered::new(); - for gateway in unconnected_gateways - .into_iter() - .shuffle() - .take(number_of_parallel_connections) - { - tracing::info!(%gateway, "Attempting connection to gateway"); - let op_manager = op_manager.clone(); - select_all.push(async move { - (join_ring_request(None, gateway, &op_manager).await, gateway) - }); - } - select_all.for_each(|(res, gateway)| async move { - if let Err(error) = res { - if !matches!( - error, - OpError::ConnError(crate::node::ConnectionError::UnwantedConnection) - ) { - tracing::error!(%gateway, %error, "Failed while attempting connection to gateway"); - } - } - }).await; - } else if open_conns >= BOOTSTRAP_THRESHOLD { - tracing::trace!( - "Have {} connections (>= threshold of {}), not attempting gateway connections", - open_conns, - BOOTSTRAP_THRESHOLD - ); - } - - // Determine wait time based on connection state - let wait_time = if open_conns == 0 { - // No connections at all - retry quickly - tracing::debug!("No connections yet, waiting {}s before retry", WAIT_TIME); - WAIT_TIME - } else if open_conns < BOOTSTRAP_THRESHOLD { - // Some connections but below threshold - moderate wait - tracing::debug!( - "Have {} connections (below threshold of {}), waiting {}s", - open_conns, - BOOTSTRAP_THRESHOLD, - WAIT_TIME * 3 - ); - WAIT_TIME * 3 - } else { - // Healthy connection pool - long wait - tracing::trace!( - "Connection pool healthy ({} connections), waiting {}s", - open_conns, - LONG_WAIT_TIME - ); - LONG_WAIT_TIME - }; - - tokio::time::sleep(Duration::from_secs(wait_time)).await; - } - }); - Ok(()) -} - -#[tracing::instrument(fields(peer = %op_manager.ring.connection_manager.pub_key), skip_all)] -pub(crate) async fn join_ring_request( - backoff: Option, - gateway: &PeerKeyLocation, - op_manager: &OpManager, -) -> Result<(), OpError> { - use crate::node::ConnectionError; - let location = gateway.location.ok_or_else(|| { - tracing::error!("Gateway location not found, this should not be possible, report an error"); - OpError::ConnError(ConnectionError::LocationUnknown) - })?; - - if !op_manager - .ring - .connection_manager - .should_accept(location, &gateway.peer) - { - return Err(OpError::ConnError(ConnectionError::UnwantedConnection)); - } - - let mut backoff = backoff; - if let Some(backoff_state) = backoff.as_mut() { - tracing::warn!( - "Performing a new join, attempt {}", - backoff_state.retries() + 1 - ); - if backoff_state.sleep().await.is_none() { - tracing::error!("Max number of retries reached"); - if op_manager.ring.open_connections() == 0 { - let tx = Transaction::new::(); - return Err(OpError::MaxRetriesExceeded(tx, tx.transaction_type())); - } else { - return Ok(()); - } - } - } - - let own = op_manager.ring.connection_manager.own_location(); - let ttl = op_manager - .ring - .max_hops_to_live - .max(1) - .min(u8::MAX as usize) as u8; - let target_connections = op_manager.ring.connection_manager.min_connections; - - let (tx, mut op, msg) = ConnectOpV2::initiate_join_request( - own.clone(), - gateway.clone(), - location, - ttl, - target_connections, - ); - - op.gateway = Some(Box::new(gateway.clone())); - if let Some(backoff) = backoff { - op.backoff = Some(backoff); - } - - tracing::info!(%gateway.peer, tx = %tx, "Attempting network join using connect_v2"); - - op_manager - .notify_op_change( - NetMessage::V1(NetMessageV1::ConnectV2(msg)), - OpEnum::ConnectV2(Box::new(op)), - ) - .await?; - - Ok(()) -} - pub(crate) struct ForwardParams { pub left_htl: usize, pub max_htl: usize, diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index ecb0966da..d298aa149 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -8,10 +8,13 @@ use std::collections::HashSet; use std::fmt; use std::net::SocketAddr; -use std::time::Instant; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use futures::{stream::FuturesUnordered, StreamExt}; use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; +use tokio::task; use crate::client_events::HostResult; use crate::dev_tool::Location; @@ -19,7 +22,7 @@ use crate::message::{InnerMessage, NetMessage, NetMessageV1, NodeEvent, Transact use crate::node::{IsOperationCompleted, NetworkBridge, OpManager, PeerId}; use crate::operations::{OpEnum, OpError, OpInitialization, OpOutcome, Operation, OperationResult}; use crate::ring::PeerKeyLocation; -use crate::util::{Backoff, Contains}; +use crate::util::{Backoff, Contains, IterExt}; use freenet_stdlib::client_api::HostResponse; /// Top-level message envelope used by the new connect handshake. @@ -696,6 +699,186 @@ fn store_operation_state_with_msg( } } +#[tracing::instrument(fields(peer = %op_manager.ring.connection_manager.pub_key), skip_all)] +pub(crate) async fn join_ring_request( + backoff: Option, + gateway: &PeerKeyLocation, + op_manager: &OpManager, +) -> Result<(), OpError> { + use crate::node::ConnectionError; + let location = gateway.location.ok_or_else(|| { + tracing::error!("Gateway location not found, this should not be possible, report an error"); + OpError::ConnError(ConnectionError::LocationUnknown) + })?; + + if !op_manager + .ring + .connection_manager + .should_accept(location, &gateway.peer) + { + return Err(OpError::ConnError(ConnectionError::UnwantedConnection)); + } + + let mut backoff = backoff; + if let Some(backoff_state) = backoff.as_mut() { + tracing::warn!( + "Performing a new join, attempt {}", + backoff_state.retries() + 1 + ); + if backoff_state.sleep().await.is_none() { + tracing::error!("Max number of retries reached"); + if op_manager.ring.open_connections() == 0 { + let tx = Transaction::new::(); + return Err(OpError::MaxRetriesExceeded(tx, tx.transaction_type())); + } else { + return Ok(()); + } + } + } + + let own = op_manager.ring.connection_manager.own_location(); + let ttl = op_manager + .ring + .max_hops_to_live + .max(1) + .min(u8::MAX as usize) as u8; + let target_connections = op_manager.ring.connection_manager.min_connections; + + let (tx, mut op, msg) = ConnectOpV2::initiate_join_request( + own.clone(), + gateway.clone(), + location, + ttl, + target_connections, + ); + + op.gateway = Some(Box::new(gateway.clone())); + if let Some(backoff) = backoff { + op.backoff = Some(backoff); + } + + tracing::info!(%gateway.peer, tx = %tx, "Attempting network join using connect_v2"); + + op_manager + .notify_op_change( + NetMessage::V1(NetMessageV1::ConnectV2(msg)), + OpEnum::ConnectV2(Box::new(op)), + ) + .await?; + + Ok(()) +} + +pub(crate) async fn initial_join_procedure( + op_manager: Arc, + gateways: &[PeerKeyLocation], +) -> Result<(), OpError> { + let number_of_parallel_connections = { + let max_potential_conns_per_gw = op_manager.ring.max_hops_to_live; + let needed_to_cover_max = + op_manager.ring.connection_manager.max_connections / max_potential_conns_per_gw; + gateways.iter().take(needed_to_cover_max).count().max(2) + }; + let gateways = gateways.to_vec(); + task::spawn(async move { + if gateways.is_empty() { + tracing::warn!("No gateways available, aborting join procedure"); + return; + } + + const WAIT_TIME: u64 = 1; + const LONG_WAIT_TIME: u64 = 30; + const BOOTSTRAP_THRESHOLD: usize = 4; + + tracing::info!( + "Starting initial join procedure with {} gateways", + gateways.len() + ); + + loop { + let open_conns = op_manager.ring.open_connections(); + let unconnected_gateways: Vec<_> = + op_manager.ring.is_not_connected(gateways.iter()).collect(); + + tracing::debug!( + "Connection status: open_connections = {}, unconnected_gateways = {}", + open_conns, + unconnected_gateways.len() + ); + + let unconnected_count = unconnected_gateways.len(); + + if open_conns < BOOTSTRAP_THRESHOLD && unconnected_count > 0 { + tracing::info!( + "Below bootstrap threshold ({} < {}), attempting to connect to {} gateways", + open_conns, + BOOTSTRAP_THRESHOLD, + number_of_parallel_connections.min(unconnected_count) + ); + let select_all = FuturesUnordered::new(); + for gateway in unconnected_gateways + .into_iter() + .shuffle() + .take(number_of_parallel_connections) + { + tracing::info!(%gateway, "Attempting connection to gateway"); + let op_manager = op_manager.clone(); + select_all.push(async move { + (join_ring_request(None, gateway, &op_manager).await, gateway) + }); + } + select_all + .for_each(|(res, gateway)| async move { + if let Err(error) = res { + if !matches!( + error, + OpError::ConnError( + crate::node::ConnectionError::UnwantedConnection + ) + ) { + tracing::error!( + %gateway, + %error, + "Failed while attempting connection to gateway" + ); + } + } + }) + .await; + } else if open_conns >= BOOTSTRAP_THRESHOLD { + tracing::trace!( + "Have {} connections (>= threshold of {}), not attempting gateway connections", + open_conns, + BOOTSTRAP_THRESHOLD + ); + } + + let wait_time = if open_conns == 0 { + tracing::debug!("No connections yet, waiting {}s before retry", WAIT_TIME); + WAIT_TIME + } else if open_conns < BOOTSTRAP_THRESHOLD { + tracing::debug!( + "Have {} connections (below threshold of {}), waiting {}s", + open_conns, + BOOTSTRAP_THRESHOLD, + WAIT_TIME * 3 + ); + WAIT_TIME * 3 + } else { + tracing::trace!( + "Connection pool healthy ({} connections), waiting {}s", + open_conns, + LONG_WAIT_TIME + ); + LONG_WAIT_TIME + }; + + tokio::time::sleep(Duration::from_secs(wait_time)).await; + } + }); + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/core/src/router/isotonic_estimator.rs b/crates/core/src/router/isotonic_estimator.rs index 9a82ba228..a02cb8034 100644 --- a/crates/core/src/router/isotonic_estimator.rs +++ b/crates/core/src/router/isotonic_estimator.rs @@ -217,6 +217,7 @@ impl Adjustment { mod tests { use super::*; + use tracing::debug; // This test `test_peer_time_estimator` checks the accuracy of the `RoutingOutcomeEstimator` struct's // `estimate_retrieval_time()` method. It generates a list of 100 random events, where each event @@ -239,7 +240,7 @@ mod tests { for _ in 0..100 { let peer = PeerKeyLocation::random(); if peer.location.is_none() { - println!("Peer location is none for {peer:?}"); + debug!("Peer location is none for {peer:?}"); } let contract_location = Location::random(); events.push(simulate_positive_request(peer, contract_location)); @@ -265,7 +266,7 @@ mod tests { // Check that the errors are small let average_error = errors.iter().sum::() / errors.len() as f64; - println!("Average error: {average_error}"); + debug!("Average error: {average_error}"); assert!(average_error < 0.01); } @@ -276,7 +277,7 @@ mod tests { for _ in 0..100 { let peer = PeerKeyLocation::random(); if peer.location.is_none() { - println!("Peer location is none for {peer:?}"); + debug!("Peer location is none for {peer:?}"); } let contract_location = Location::random(); events.push(simulate_negative_request(peer, contract_location)); @@ -302,7 +303,7 @@ mod tests { // Check that the errors are small let average_error = errors.iter().sum::() / errors.len() as f64; - println!("Average error: {average_error}"); + debug!("Average error: {average_error}"); assert!(average_error < 0.01); } diff --git a/crates/core/src/test_utils.rs b/crates/core/src/test_utils.rs index 57cf57a00..a90f463d2 100644 --- a/crates/core/src/test_utils.rs +++ b/crates/core/src/test_utils.rs @@ -15,6 +15,7 @@ use freenet_stdlib::{ }; use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; +use tracing::{error, info}; use crate::util::workspace::get_workspace_target_dir; @@ -390,9 +391,9 @@ fn compile_contract(name: &str) -> anyhow::Result> { contracts.join(name) }; - println!("module path: {contract_path:?}"); + info!("module path: {contract_path:?}"); let target = get_workspace_target_dir(); - println!( + info!( "trying to compile the test contract, target: {}", target.display() ); @@ -411,7 +412,7 @@ fn compile_contract(name: &str) -> anyhow::Result> { .join("release") .join(name.replace('-', "_")) .with_extension("wasm"); - println!("output file: {output_file:?}"); + info!("output file: {output_file:?}"); Ok(std::fs::read(output_file)?) } @@ -422,7 +423,7 @@ fn compile_delegate(name: &str) -> anyhow::Result> { delegates.join(name) }; - println!("delegate path: {delegate_path:?}"); + info!("delegate path: {delegate_path:?}"); // Check if the delegate directory exists if !delegate_path.exists() { @@ -432,7 +433,7 @@ fn compile_delegate(name: &str) -> anyhow::Result> { } let target = get_workspace_target_dir(); - println!( + info!( "trying to compile the test delegate, target: {}", target.display() ); @@ -451,7 +452,7 @@ fn compile_delegate(name: &str) -> anyhow::Result> { .join("release") .join(name.replace('-', "_")) .with_extension("wasm"); - println!("output file: {output_file:?}"); + info!("output file: {output_file:?}"); // Check if output file exists before reading if !output_file.exists() { @@ -462,7 +463,7 @@ fn compile_delegate(name: &str) -> anyhow::Result> { let wasm_data = std::fs::read(&output_file) .map_err(|e| anyhow::anyhow!("Failed to read output file {output_file:?}: {e}"))?; - println!("WASM size: {} bytes", wasm_data.len()); + info!("WASM size: {} bytes", wasm_data.len()); Ok(wasm_data) } @@ -513,7 +514,7 @@ fn compile_rust_wasm_lib(cli_config: &BuildToolConfig, work_dir: &Path) -> anyho }; let package_type = cli_config.package_type; - println!("Compiling {package_type} with rust"); + info!("Compiling {package_type} with rust"); // Set CARGO_TARGET_DIR if not already set to ensure consistent output location let mut command = Command::new("cargo"); @@ -528,7 +529,7 @@ fn compile_rust_wasm_lib(cli_config: &BuildToolConfig, work_dir: &Path) -> anyho .stderr(Stdio::piped()) .spawn() .map_err(|e| { - eprintln!("Error while executing cargo command: {e}"); + error!("Error while executing cargo command: {e}"); anyhow::anyhow!("Error while executing cargo command: {e}") })?; pipe_std_streams(child)?; diff --git a/crates/core/src/topology/request_density_tracker.rs b/crates/core/src/topology/request_density_tracker.rs index df56efa01..4820c694c 100644 --- a/crates/core/src/topology/request_density_tracker.rs +++ b/crates/core/src/topology/request_density_tracker.rs @@ -248,6 +248,7 @@ pub(crate) enum DensityMapError { mod tests { use super::*; use std::sync::RwLock; + use tracing::debug; #[test] fn test_create_density_map() { @@ -327,12 +328,12 @@ mod tests { let result = result.unwrap(); // Scan and dumb densities 0.0 to 1.0 at 0.01 intervals - println!("Location\tDensity"); + debug!("Location\tDensity"); for i in 0..100 { let location = Location::new(i as f64 / 100.0); let density = result.get_density_at(location).unwrap(); // Print and round density to 2 decimals - println!( + debug!( "{}\t{}", location.as_f64(), (density * 100.0).round() / 100.0 diff --git a/crates/core/src/tracing/mod.rs b/crates/core/src/tracing/mod.rs index bde43deda..83e1d2e91 100644 --- a/crates/core/src/tracing/mod.rs +++ b/crates/core/src/tracing/mod.rs @@ -1353,7 +1353,7 @@ pub(crate) mod tracer { { if std::env::var("TOKIO_CONSOLE").is_ok() { console_subscriber::init(); - println!( + tracing::info!( "Tokio console subscriber initialized. Connect with 'tokio-console' command." ); return Ok(()); @@ -1449,7 +1449,7 @@ pub(crate) mod tracer { } else { "freenet-core".to_string() }; - println!("setting OT collector with identifier: {identifier}"); + tracing::info!("setting OT collector with identifier: {identifier}"); // TODO: Fix OpenTelemetry version conflicts and API changes // The code below needs to be updated to work with the new OpenTelemetry API // For now, we'll just use the fmt_layer without OpenTelemetry tracing diff --git a/crates/core/src/transport/peer_connection/outbound_stream.rs b/crates/core/src/transport/peer_connection/outbound_stream.rs index 41af4909d..bd28b30d5 100644 --- a/crates/core/src/transport/peer_connection/outbound_stream.rs +++ b/crates/core/src/transport/peer_connection/outbound_stream.rs @@ -134,6 +134,7 @@ mod tests { use std::net::Ipv4Addr; use std::time::Instant; use tests::packet_data::MAX_PACKET_SIZE; + use tracing::debug; use super::{ symmetric_message::{SymmetricMessage, SymmetricMessagePayload}, @@ -265,10 +266,10 @@ mod tests { // For 10KB at 100KB/s, should take at least 100ms theoretically // But with 8 packets and 1 packet per 10ms batch, actual time is ~70-80ms // Allow margin for processing overhead and timing precision - println!( + debug!( "Transfer took: {elapsed:?}, packets sent: {packet_count}, expected: {expected_packets}" ); - println!("Bytes per packet: ~{MAX_DATA_SIZE}"); + debug!("Bytes per packet: ~{MAX_DATA_SIZE}"); assert!( elapsed.as_millis() >= 60, "Transfer completed too quickly: {elapsed:?}" diff --git a/crates/core/src/wasm_runtime/store.rs b/crates/core/src/wasm_runtime/store.rs index 15c701cbe..07044f377 100644 --- a/crates/core/src/wasm_runtime/store.rs +++ b/crates/core/src/wasm_runtime/store.rs @@ -7,6 +7,7 @@ use std::io::{self, BufReader, BufWriter, Seek, Write}; use std::path::{Path, PathBuf}; use std::time::Duration; use std::{fs::File, io::Read}; +use tracing::error; const INTERNAL_KEY: usize = 32; const TOMBSTONE_MARKER: usize = 1; @@ -325,7 +326,7 @@ fn compact_index_file(key_file_path: &Path) -> std::io::Re let mut original_reader = BufReader::new(original_file); let mut temp_writer = SafeWriter::::new(&temp_file_path, true).inspect_err(|_| { if let Err(e) = fs::remove_file(&lock_file_path) { - eprintln!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); + error!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); } })?; @@ -340,7 +341,7 @@ fn compact_index_file(key_file_path: &Path) -> std::io::Re }; if let Err(err) = temp_writer.insert_record(store_key, value) { if let Err(e) = fs::remove_file(&lock_file_path) { - eprintln!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); + error!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); } return Err(err); } @@ -356,7 +357,7 @@ fn compact_index_file(key_file_path: &Path) -> std::io::Re Err(other) => { // Handle other errors gracefully if let Err(e) = fs::remove_file(&lock_file_path) { - eprintln!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); + error!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); } return Err(other); } @@ -366,7 +367,7 @@ fn compact_index_file(key_file_path: &Path) -> std::io::Re // Check if any deleted records were found; if not, skip compaction if !any_deleted { if let Err(e) = fs::remove_file(&lock_file_path) { - eprintln!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); + error!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); } return Ok(()); } @@ -374,7 +375,7 @@ fn compact_index_file(key_file_path: &Path) -> std::io::Re // Clean up and finalize the compaction process if let Err(e) = temp_writer.flush() { if let Err(e) = fs::remove_file(&lock_file_path) { - eprintln!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); + error!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); } return Err(e); } @@ -382,14 +383,14 @@ fn compact_index_file(key_file_path: &Path) -> std::io::Re // Replace the original file with the temporary file if let Err(e) = fs::rename(&temp_file_path, key_file_path) { if let Err(e) = fs::remove_file(&lock_file_path) { - eprintln!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); + error!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); } return Err(e); } // Remove the lock file fs::remove_file(&lock_file_path).map_err(|e| { - eprintln!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); + error!("{}:{}: Failed to remove lock file: {e}", file!(), line!()); e })?; @@ -589,13 +590,13 @@ mod tests { create_test_data(&mut file, &key_file_path, shared_data, i); } else if let Err(err) = super::compact_index_file::(&key_file_path) { - eprintln!("Thread encountered an error during compaction: {err}"); + error!("Thread encountered an error during compaction: {err}"); return Err(err); } barrier.wait(); // compact a last time so we know what data to compare against super::compact_index_file::(&key_file_path).map_err(|err| { - eprintln!("Thread encountered an error during compaction: {err}"); + error!("Thread encountered an error during compaction: {err}"); err }) }) diff --git a/crates/core/src/wasm_runtime/tests/contract_metering.rs b/crates/core/src/wasm_runtime/tests/contract_metering.rs index 88c63a857..c4d849ebe 100644 --- a/crates/core/src/wasm_runtime/tests/contract_metering.rs +++ b/crates/core/src/wasm_runtime/tests/contract_metering.rs @@ -5,6 +5,7 @@ use crate::wasm_runtime::tests::TestSetup; use crate::wasm_runtime::{ContractExecError, RuntimeInnerError}; use freenet_stdlib::prelude::*; use std::time::Instant; +use tracing::info; const TEST_CONTRACT_METERING: &str = "test_contract_metering"; @@ -52,7 +53,7 @@ fn validate_state_metering() -> Result<(), Box> { ); let duration = time.elapsed().as_secs_f64(); - println!("Duration: {duration:.2}s"); + info!("Duration: {duration:.2}s"); assert!(duration < 5.0, "Should not timeout"); assert!( @@ -103,7 +104,7 @@ fn test_update_state_metering() -> Result<(), Box> { ); let duration = time.elapsed().as_secs_f64(); - println!("Duration: {duration:.2}s"); + info!("Duration: {duration:.2}s"); assert!(duration < 5.0, "Should not timeout"); assert!( @@ -150,7 +151,7 @@ fn test_summarize_state_metering() -> Result<(), Box> { let result = runtime.summarize_state(&contract_key, &Parameters::from([].as_ref()), &state); let duration = time.elapsed().as_secs_f64(); - println!("Duration: {duration:.2}s"); + info!("Duration: {duration:.2}s"); assert!(duration < 5.0, "Should not timeout"); assert!( @@ -202,7 +203,7 @@ fn test_get_state_delta_metering() -> Result<(), Box> { ); let duration = time.elapsed().as_secs_f64(); - println!("Duration: {duration:.2}s"); + info!("Duration: {duration:.2}s"); assert!(duration < 5.0, "Should not timeout"); assert!( diff --git a/crates/core/src/wasm_runtime/tests/mod.rs b/crates/core/src/wasm_runtime/tests/mod.rs index 955c4062e..110c49a0c 100644 --- a/crates/core/src/wasm_runtime/tests/mod.rs +++ b/crates/core/src/wasm_runtime/tests/mod.rs @@ -6,6 +6,7 @@ use freenet_stdlib::prelude::{ use crate::util::tests::get_temp_dir; use crate::util::workspace::get_workspace_target_dir; +use tracing::info; use super::{ContractStore, DelegateStore, SecretsStore}; @@ -22,7 +23,7 @@ pub(crate) fn get_test_module(name: &str) -> Result, Box Result, Box TestResult { contract_key ); if recv_state != wrapped_state { - eprintln!("State mismatch!"); - eprintln!( + tracing::error!("State mismatch!"); + tracing::error!( "Expected state: {:?}", String::from_utf8_lossy(wrapped_state.as_ref()) ); - eprintln!( + tracing::error!( "Received state: {:?}", String::from_utf8_lossy(recv_state.as_ref()) ); diff --git a/crates/core/tests/error_notification.rs b/crates/core/tests/error_notification.rs index 51edd50db..2a111ee5e 100644 --- a/crates/core/tests/error_notification.rs +++ b/crates/core/tests/error_notification.rs @@ -24,7 +24,7 @@ use std::{ }; use tokio::{select, time::timeout}; use tokio_tungstenite::connect_async; -use tracing::error; +use tracing::{error, info}; static RNG: LazyLock> = LazyLock::new(|| { use rand::SeedableRng; @@ -59,7 +59,7 @@ async fn test_get_error_notification(ctx: &mut TestContext) -> TestResult { let (ws_stream, _) = connect_async(&url).await?; let mut client = WebApi::start(ws_stream); - println!("Testing GET operation for non-existent contract (should fail with error)"); + info!("Testing GET operation for non-existent contract (should fail with error)"); // Create a contract to get its key, but we won't PUT it - so GET will fail const TEST_CONTRACT: &str = "test-contract-integration"; @@ -76,12 +76,12 @@ async fn test_get_error_notification(ctx: &mut TestContext) -> TestResult { match get_result { Ok(Ok(response)) => { // Any response is good - means we're not hanging - println!("✓ Received response (not timing out): {:?}", response); - println!("✓ Client properly notified instead of hanging"); + info!("✓ Received response (not timing out): {:?}", response); + info!("✓ Client properly notified instead of hanging"); } Ok(Err(e)) => { // WebSocket error could indicate error was delivered - println!("✓ Received error notification: {}", e); + info!("✓ Received error notification: {}", e); } Err(_) => { panic!( @@ -92,7 +92,7 @@ async fn test_get_error_notification(ctx: &mut TestContext) -> TestResult { } } - println!("Error notification test passed - client did not hang on operation failure"); + info!("Error notification test passed - client did not hang on operation failure"); // Properly close the client client @@ -126,7 +126,7 @@ async fn test_put_error_notification(ctx: &mut TestContext) -> TestResult { let (ws_stream, _) = connect_async(&url).await?; let mut client = WebApi::start(ws_stream); - println!("Testing PUT operation with invalid contract (should fail with error)"); + info!("Testing PUT operation with invalid contract (should fail with error)"); // Try to PUT with malformed contract data - this should fail // We'll use make_put with invalid state to trigger an error @@ -151,12 +151,12 @@ async fn test_put_error_notification(ctx: &mut TestContext) -> TestResult { match put_result { Ok(Ok(response)) => { // Any response is good - means we're not hanging - println!("✓ Received response (not timing out): {:?}", response); - println!("✓ Client properly notified instead of hanging"); + info!("✓ Received response (not timing out): {:?}", response); + info!("✓ Client properly notified instead of hanging"); } Ok(Err(e)) => { // WebSocket error could indicate error was delivered - println!("✓ Received error notification: {}", e); + info!("✓ Received error notification: {}", e); } Err(_) => { panic!( @@ -167,7 +167,7 @@ async fn test_put_error_notification(ctx: &mut TestContext) -> TestResult { } } - println!("PUT error notification test passed - client did not hang on operation failure"); + info!("PUT error notification test passed - client did not hang on operation failure"); // Properly close the client client @@ -201,7 +201,7 @@ async fn test_update_error_notification(ctx: &mut TestContext) -> TestResult { let (ws_stream, _) = connect_async(&url).await?; let mut client = WebApi::start(ws_stream); - println!("Testing UPDATE operation for non-existent contract (should fail with error)"); + info!("Testing UPDATE operation for non-existent contract (should fail with error)"); // Create a contract key for a contract that doesn't exist const TEST_CONTRACT: &str = "test-contract-integration"; @@ -223,12 +223,12 @@ async fn test_update_error_notification(ctx: &mut TestContext) -> TestResult { match update_result { Ok(Ok(response)) => { // Any response is good - means we're not hanging - println!("✓ Received response (not timing out): {:?}", response); - println!("✓ Client properly notified instead of hanging"); + info!("✓ Received response (not timing out): {:?}", response); + info!("✓ Client properly notified instead of hanging"); } Ok(Err(e)) => { // WebSocket error could indicate error was delivered - println!("✓ Received error notification: {}", e); + info!("✓ Received error notification: {}", e); } Err(_) => { panic!( @@ -239,7 +239,7 @@ async fn test_update_error_notification(ctx: &mut TestContext) -> TestResult { } } - println!("UPDATE error notification test passed - client did not hang on operation failure"); + info!("UPDATE error notification test passed - client did not hang on operation failure"); // Properly close the client client @@ -390,7 +390,7 @@ async fn test_connection_drop_error_notification() -> anyhow::Result<()> { tokio::select! { result = node.run() => result, _ = peer_shutdown_rx.recv() => { - println!("Peer received shutdown signal - simulating connection drop"); + info!("Peer received shutdown signal - simulating connection drop"); // We can't construct Infallible, so return an error to exit cleanly Err(anyhow::anyhow!("Peer shutdown requested")) } @@ -401,7 +401,7 @@ async fn test_connection_drop_error_notification() -> anyhow::Result<()> { // Main test logic let test = tokio::time::timeout(Duration::from_secs(90), async move { // Wait for nodes to start and connect - println!("Waiting for nodes to start up and connect..."); + info!("Waiting for nodes to start up and connect..."); tokio::time::sleep(Duration::from_secs(15)).await; // Connect a client to the gateway @@ -412,7 +412,7 @@ async fn test_connection_drop_error_notification() -> anyhow::Result<()> { let (ws_stream, _) = connect_async(&url).await?; let mut client = WebApi::start(ws_stream); - println!("Client connected to gateway"); + info!("Client connected to gateway"); // Try to PUT a contract (this should work initially) const TEST_CONTRACT: &str = "test-contract-integration"; @@ -434,7 +434,7 @@ async fn test_connection_drop_error_notification() -> anyhow::Result<()> { tokio::time::sleep(Duration::from_millis(500)).await; // Now forcibly drop the peer connection - println!("Dropping peer connection to simulate network failure..."); + info!("Dropping peer connection to simulate network failure..."); peer_shutdown_tx.send(()).await?; // Give time for the drop to be detected @@ -442,17 +442,17 @@ async fn test_connection_drop_error_notification() -> anyhow::Result<()> { // The PUT may or may not succeed depending on timing, but we should get SOME response // The key is that we don't hang indefinitely - println!("Waiting for response after connection drop..."); + info!("Waiting for response after connection drop..."); let response_result = timeout(Duration::from_secs(30), client.recv()).await; match response_result { Ok(Ok(response)) => { - println!("✓ Received response after connection drop: {:?}", response); - println!("✓ Client properly handled connection drop scenario"); + info!("✓ Received response after connection drop: {:?}", response); + info!("✓ Client properly handled connection drop scenario"); } Ok(Err(e)) => { - println!("✓ Received error notification after connection drop: {}", e); - println!("✓ Client properly notified of connection issues"); + info!("✓ Received error notification after connection drop: {}", e); + info!("✓ Client properly notified of connection issues"); } Err(_) => { panic!( @@ -463,7 +463,7 @@ async fn test_connection_drop_error_notification() -> anyhow::Result<()> { } } - println!("Connection drop error notification test passed"); + info!("Connection drop error notification test passed"); // Try to disconnect cleanly (may fail if connection is already gone) let _ = client.send(ClientRequest::Disconnect { cause: None }).await; diff --git a/crates/core/tests/isolated_node_regression.rs b/crates/core/tests/isolated_node_regression.rs index e8470c6c5..91c69a9c7 100644 --- a/crates/core/tests/isolated_node_regression.rs +++ b/crates/core/tests/isolated_node_regression.rs @@ -18,6 +18,7 @@ use freenet_stdlib::{ use std::time::Duration; use tokio::time::timeout; use tokio_tungstenite::connect_async; +use tracing::info; /// Test complete PUT-then-GET workflow on isolated node /// @@ -50,7 +51,7 @@ async fn test_isolated_node_put_get_workflow(ctx: &mut TestContext) -> TestResul let (ws_stream, _) = connect_async(&url).await?; let mut client = WebApi::start(ws_stream); - println!("Step 1: Performing PUT operation to cache contract locally"); + info!("Step 1: Performing PUT operation to cache contract locally"); // Perform PUT operation - this should cache the contract locally let put_start = std::time::Instant::now(); @@ -63,7 +64,7 @@ async fn test_isolated_node_put_get_workflow(ctx: &mut TestContext) -> TestResul match put_result { Ok(Ok(HostResponse::ContractResponse(ContractResponse::PutResponse { key }))) => { assert_eq!(key, contract_key); - println!("PUT operation successful in {:?}", put_elapsed); + info!("PUT operation successful in {:?}", put_elapsed); } Ok(Ok(other)) => { panic!("Unexpected PUT response: {:?}", other); @@ -76,9 +77,9 @@ async fn test_isolated_node_put_get_workflow(ctx: &mut TestContext) -> TestResul } } - println!("Contract verified in local cache"); + info!("Contract verified in local cache"); - println!("Step 2: Performing GET operation using local cache"); + info!("Step 2: Performing GET operation using local cache"); // Now perform GET operation - should use local cache without self-routing let get_start = std::time::Instant::now(); @@ -110,7 +111,7 @@ async fn test_isolated_node_put_get_workflow(ctx: &mut TestContext) -> TestResul contract_key ); assert_eq!(recv_state, wrapped_state); - println!( + info!( "GET operation successful from local cache in {:?}", get_elapsed ); @@ -126,7 +127,7 @@ async fn test_isolated_node_put_get_workflow(ctx: &mut TestContext) -> TestResul } } - println!("PUT-then-GET workflow completed successfully without self-routing"); + info!("PUT-then-GET workflow completed successfully without self-routing"); // Properly close the client client @@ -177,7 +178,7 @@ async fn test_concurrent_get_deduplication_race(ctx: &mut TestContext) -> TestRe let (ws_stream3, _) = connect_async(&url).await?; let mut client3 = WebApi::start(ws_stream3); - println!("Step 1: PUT contract to cache it locally"); + info!("Step 1: PUT contract to cache it locally"); // Cache the contract locally using client1 make_put(&mut client1, wrapped_state.clone(), contract.clone(), false).await?; @@ -186,15 +187,15 @@ async fn test_concurrent_get_deduplication_race(ctx: &mut TestContext) -> TestRe match put_result { Ok(Ok(HostResponse::ContractResponse(ContractResponse::PutResponse { key }))) => { assert_eq!(key, contract_key); - println!("Contract cached successfully"); + info!("Contract cached successfully"); } other => { panic!("PUT failed: {:?}", other); } } - println!("Step 2: Concurrent GET requests from multiple clients"); - println!("This tests the deduplication race condition from issue #1886"); + info!("Step 2: Concurrent GET requests from multiple clients"); + info!("This tests the deduplication race condition from issue #1886"); // Send GET requests concurrently from all clients // The contract is cached, so these will complete instantly @@ -234,26 +235,26 @@ async fn test_concurrent_get_deduplication_race(ctx: &mut TestContext) -> TestRe )) => { assert_eq!(key, contract_key); assert_eq!(state, wrapped_state); - println!("Client {}: Received GET response", client_num); + info!("Client {}: Received GET response", client_num); true } Ok((_, Ok(Ok(other)))) => { - println!("Client {}: Unexpected response: {:?}", client_num, other); + info!("Client {}: Unexpected response: {:?}", client_num, other); false } Ok((_, Ok(Err(e)))) => { - println!("Client {}: Error: {}", client_num, e); + info!("Client {}: Error: {}", client_num, e); false } Ok((_, Err(_))) => { - println!( + info!( "Client {}: TIMEOUT - This is the bug from issue #1886!", client_num ); false } Err(e) => { - println!("Client {}: Failed to send request: {}", client_num, e); + info!("Client {}: Failed to send request: {}", client_num, e); false } } @@ -270,7 +271,7 @@ async fn test_concurrent_get_deduplication_race(ctx: &mut TestContext) -> TestRe "All clients should receive GET responses. Failures indicate issue #1886 race condition." ); - println!("All clients received responses - no race condition detected"); + info!("All clients received responses - no race condition detected"); // Cleanup client1 @@ -322,7 +323,7 @@ async fn test_isolated_node_local_subscription(ctx: &mut TestContext) -> TestRes let (ws_stream2, _) = connect_async(&url).await?; let mut client2 = WebApi::start(ws_stream2); - println!("Step 1: Performing PUT operation to cache contract locally"); + info!("Step 1: Performing PUT operation to cache contract locally"); // Perform PUT operation - this should cache the contract locally make_put(&mut client1, wrapped_state.clone(), contract.clone(), false).await?; @@ -333,7 +334,7 @@ async fn test_isolated_node_local_subscription(ctx: &mut TestContext) -> TestRes match put_result { Ok(Ok(HostResponse::ContractResponse(ContractResponse::PutResponse { key }))) => { assert_eq!(key, contract_key); - println!("PUT operation successful"); + info!("PUT operation successful"); } Ok(Ok(other)) => { panic!("Unexpected PUT response: {:?}", other); @@ -346,7 +347,7 @@ async fn test_isolated_node_local_subscription(ctx: &mut TestContext) -> TestRes } } - println!("Step 2: Testing SUBSCRIBE operation on locally cached contract"); + info!("Step 2: Testing SUBSCRIBE operation on locally cached contract"); // Subscribe first client to the contract - should work with local contract let subscribe_start = std::time::Instant::now(); @@ -363,7 +364,7 @@ async fn test_isolated_node_local_subscription(ctx: &mut TestContext) -> TestRes subscribed, }))) => { assert_eq!(key, contract_key); - println!( + info!( "Client 1: SUBSCRIBE operation successful in {:?}", subscribe_elapsed ); @@ -388,7 +389,7 @@ async fn test_isolated_node_local_subscription(ctx: &mut TestContext) -> TestRes } } - println!("Step 3: Testing second client subscription"); + info!("Step 3: Testing second client subscription"); // Subscribe second client - verifies multiple clients can subscribe locally make_subscribe(&mut client2, contract_key).await?; @@ -401,7 +402,7 @@ async fn test_isolated_node_local_subscription(ctx: &mut TestContext) -> TestRes subscribed, }))) => { assert_eq!(key, contract_key); - println!("Client 2: SUBSCRIBE operation successful"); + info!("Client 2: SUBSCRIBE operation successful"); assert!(subscribed); } _ => { @@ -414,7 +415,7 @@ async fn test_isolated_node_local_subscription(ctx: &mut TestContext) -> TestRes // has been validated - both clients successfully receive SubscribeResponse. // Update notification delivery can be tested once UPDATE is fixed for isolated nodes. - println!( + info!( "Local subscription test completed successfully - both clients received SubscribeResponse" ); @@ -462,7 +463,7 @@ async fn test_isolated_node_update_operation(ctx: &mut TestContext) -> TestResul let (ws_stream, _) = connect_async(&url).await?; let mut client = WebApi::start(ws_stream); - println!("Step 1: Performing PUT operation to cache contract locally"); + info!("Step 1: Performing PUT operation to cache contract locally"); // Perform PUT operation - this caches the contract locally let put_start = std::time::Instant::now(); @@ -481,7 +482,7 @@ async fn test_isolated_node_update_operation(ctx: &mut TestContext) -> TestResul match put_result { Ok(Ok(HostResponse::ContractResponse(ContractResponse::PutResponse { key }))) => { assert_eq!(key, contract_key); - println!("PUT operation successful in {:?}", put_elapsed); + info!("PUT operation successful in {:?}", put_elapsed); } Ok(Ok(other)) => { panic!("Unexpected PUT response: {:?}", other); @@ -494,7 +495,7 @@ async fn test_isolated_node_update_operation(ctx: &mut TestContext) -> TestResul } } - println!("Step 2: Performing UPDATE operation with new state"); + info!("Step 2: Performing UPDATE operation with new state"); // Create updated state (add a todo item) let updated_state = freenet::test_utils::create_todo_list_with_item("Test task"); @@ -522,7 +523,7 @@ async fn test_isolated_node_update_operation(ctx: &mut TestContext) -> TestResul key, .. }))) => { assert_eq!(key, contract_key); - println!("UPDATE operation successful in {:?}", update_elapsed); + info!("UPDATE operation successful in {:?}", update_elapsed); } Ok(Ok(other)) => { panic!("Unexpected UPDATE response: {:?}", other); @@ -535,7 +536,7 @@ async fn test_isolated_node_update_operation(ctx: &mut TestContext) -> TestResul } } - println!("Step 3: Performing GET operation to verify updated state"); + info!("Step 3: Performing GET operation to verify updated state"); // Verify the state was updated by performing a GET let get_start = std::time::Instant::now(); @@ -552,7 +553,7 @@ async fn test_isolated_node_update_operation(ctx: &mut TestContext) -> TestResul // Parse both states to verify the tasks were updated correctly // Note: UPDATE operations may modify the version number, so we check the tasks array let recv_str = String::from_utf8_lossy(recv_state.as_ref()); - println!("Received state after UPDATE: {}", recv_str); + info!("Received state after UPDATE: {}", recv_str); // Verify the state contains the expected task assert!( @@ -570,7 +571,7 @@ async fn test_isolated_node_update_operation(ctx: &mut TestContext) -> TestResul "Tasks array should not be empty after update" ); - println!( + info!( "GET operation successful, state correctly updated in {:?}", get_elapsed ); @@ -586,7 +587,7 @@ async fn test_isolated_node_update_operation(ctx: &mut TestContext) -> TestResul } } - println!("PUT-UPDATE-GET workflow completed successfully on isolated node"); + info!("PUT-UPDATE-GET workflow completed successfully on isolated node"); // Properly close the client client diff --git a/crates/core/tests/operations.rs b/crates/core/tests/operations.rs index 06cac201b..a554a2085 100644 --- a/crates/core/tests/operations.rs +++ b/crates/core/tests/operations.rs @@ -1785,7 +1785,7 @@ async fn test_delegate_request(ctx: &mut TestContext) -> TestResult { key, delegate_key, "Delegate key mismatch in register response" ); - println!("Successfully registered delegate with key: {key}"); + tracing::info!("Successfully registered delegate with key: {key}"); } other => { bail!( @@ -1857,7 +1857,7 @@ async fn test_delegate_request(ctx: &mut TestContext) -> TestResult { "Response data doesn't match expected value" ); - println!("Successfully received and verified delegate response"); + tracing::info!("Successfully received and verified delegate response"); } } } diff --git a/crates/core/tests/redb_migration.rs b/crates/core/tests/redb_migration.rs index 2afe1bdc6..def6ff72b 100644 --- a/crates/core/tests/redb_migration.rs +++ b/crates/core/tests/redb_migration.rs @@ -5,6 +5,7 @@ use std::path::PathBuf; use tempfile::TempDir; +use tracing::info; /// Test that verifies automatic migration from redb v2 to v3 format /// @@ -38,8 +39,8 @@ async fn test_automatic_migration_from_v2_to_v3() -> Result<(), Box Result<(), Box Date: Thu, 6 Nov 2025 17:00:15 +0100 Subject: [PATCH 24/26] refactor(connect): ignore legacy connect messages --- crates/core/src/node/mod.rs | 89 ++++----------------------- crates/core/src/operations/connect.rs | 1 + 2 files changed, 12 insertions(+), 78 deletions(-) diff --git a/crates/core/src/node/mod.rs b/crates/core/src/node/mod.rs index 669078d3d..240b66832 100644 --- a/crates/core/src/node/mod.rs +++ b/crates/core/src/node/mod.rs @@ -41,7 +41,6 @@ use crate::{ local_node::Executor, message::{InnerMessage, NetMessage, Transaction, TransactionType}, operations::{ - connect::{self, ConnectOp}, connect_v2::{self, ConnectOpV2}, get, put, subscribe, update, OpEnum, OpError, OpOutcome, }, @@ -693,28 +692,12 @@ async fn process_message_v1( for i in 0..MAX_RETRIES { tracing::debug!(?tx, "Processing operation, iteration: {i}"); match msg { - NetMessageV1::Connect(ref op) => { - let parent_span = tracing::Span::current(); - let span = tracing::info_span!( - parent: parent_span, - "handle_connect_op_request", + NetMessageV1::Connect(ref _op) => { + tracing::warn!( transaction = %msg.id(), - tx_type = %msg.id().transaction_type() + "Ignoring legacy NetMessageV1::Connect message during ConnectV2 migration" ); - let op_result = - handle_op_request::(&op_manager, &mut conn_manager, op) - .instrument(span) - .await; - - handle_op_not_available!(op_result); - return report_result( - tx, - op_result, - &op_manager, - executor_callback, - &mut *event_listener, - ) - .await; + return; } NetMessageV1::ConnectV2(ref op) => { let parent_span = tracing::Span::current(); @@ -876,42 +859,12 @@ where tracing::debug!(?tx, "Processing pure network operation, iteration: {i}"); match msg { - NetMessageV1::Connect(ref op) => { - let parent_span = tracing::Span::current(); - let span = tracing::info_span!( - parent: parent_span, - "handle_connect_op_request", + NetMessageV1::Connect(ref _op) => { + tracing::warn!( transaction = %msg.id(), - tx_type = %msg.id().transaction_type() + "Ignoring legacy NetMessageV1::Connect message during ConnectV2 migration" ); - let op_result = - handle_op_request::(&op_manager, &mut conn_manager, op) - .instrument(span) - .await; - - if let Err(OpError::OpNotAvailable(state)) = &op_result { - match state { - OpNotAvailable::Running => { - tracing::debug!("Pure network: Operation still running"); - tokio::time::sleep(Duration::from_micros(1_000)).await; - continue; - } - OpNotAvailable::Completed => { - tracing::debug!("Pure network: Operation already completed"); - return Ok(None); - } - } - } - - // Pure network result processing - no client handling - return handle_pure_network_result( - tx, - op_result, - &op_manager, - executor_callback, - &mut *event_listener, - ) - .await; + return Ok(None); } NetMessageV1::ConnectV2(ref op) => { let parent_span = tracing::Span::current(); @@ -1213,20 +1166,6 @@ async fn handle_aborted_op( // is useless without connecting to the network, we will retry with exponential backoff // if necessary match op_manager.pop(&tx) { - // only keep attempting to connect if the node hasn't got enough connections yet - Ok(Some(OpEnum::Connect(op))) - if op.has_backoff() - && op_manager.ring.open_connections() - < op_manager.ring.connection_manager.min_connections => - { - let ConnectOp { - gateway, backoff, .. - } = *op; - if let Some(gateway) = gateway { - tracing::warn!("Retry connecting to gateway {}", gateway.peer); - connect_v2::join_ring_request(backoff, &gateway, op_manager).await?; - } - } Ok(Some(OpEnum::ConnectV2(op))) if op.has_backoff() && op_manager.ring.open_connections() @@ -1238,15 +1177,6 @@ async fn handle_aborted_op( connect_v2::join_ring_request(None, &gateway, op_manager).await?; } } - Ok(Some(OpEnum::Connect(_))) => { - // if no connections were achieved just fail - if op_manager.ring.open_connections() == 0 && op_manager.ring.is_gateway() { - tracing::warn!("Retrying joining the ring with an other gateway"); - if let Some(gateway) = gateways.iter().shuffle().next() { - connect_v2::join_ring_request(None, gateway, op_manager).await? - } - } - } Ok(Some(OpEnum::ConnectV2(_))) => { if op_manager.ring.open_connections() == 0 && op_manager.ring.is_gateway() { tracing::warn!("Retrying joining the ring with an other gateway"); @@ -1255,6 +1185,9 @@ async fn handle_aborted_op( } } } + Ok(Some(other)) => { + op_manager.push(tx, other).await?; + } _ => {} } } diff --git a/crates/core/src/operations/connect.rs b/crates/core/src/operations/connect.rs index 0c36d2202..d635d5a28 100644 --- a/crates/core/src/operations/connect.rs +++ b/crates/core/src/operations/connect.rs @@ -48,6 +48,7 @@ impl ConnectOp { } } + #[allow(dead_code)] pub fn has_backoff(&self) -> bool { self.backoff.is_some() } From 08ca88cbb7d420d78ff80293177a6967aad459a5 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Fri, 7 Nov 2025 00:01:54 +0100 Subject: [PATCH 25/26] fix: make river 2-peer and 6-peer tests pass --- .../core/src/node/network_bridge/handshake.rs | 1 + .../src/node/network_bridge/handshake_v2.rs | 198 ++- .../src/node/network_bridge/p2p_protoc.rs | 1107 +++++++++-------- .../node/network_bridge/priority_select.rs | 21 +- .../network_bridge/priority_select/tests.rs | 2 +- crates/core/src/node/op_state_manager.rs | 1 + crates/core/src/operations/connect_v2.rs | 151 ++- crates/core/src/operations/get.rs | 259 ++-- crates/core/src/operations/subscribe.rs | 27 +- crates/core/src/ring/connection_manager.rs | 54 + crates/core/src/ring/mod.rs | 39 +- 11 files changed, 1084 insertions(+), 776 deletions(-) diff --git a/crates/core/src/node/network_bridge/handshake.rs b/crates/core/src/node/network_bridge/handshake.rs index 051acebdb..821a4a68f 100644 --- a/crates/core/src/node/network_bridge/handshake.rs +++ b/crates/core/src/node/network_bridge/handshake.rs @@ -1,3 +1,4 @@ +#![allow(dead_code)] //! Handles initial connection handshake. use parking_lot::RwLock; use std::{ diff --git a/crates/core/src/node/network_bridge/handshake_v2.rs b/crates/core/src/node/network_bridge/handshake_v2.rs index 6fe9ffac8..d4b8b2f55 100644 --- a/crates/core/src/node/network_bridge/handshake_v2.rs +++ b/crates/core/src/node/network_bridge/handshake_v2.rs @@ -1,114 +1,110 @@ -//! Placeholder implementation for the upcoming ConnectV2 handshake flow. +//! Minimal handshake driver for the ConnectV2 pipeline. //! -//! The new handshake pipeline is still under active development. For now we keep a -//! compile-time stub so the surrounding modules that reference this file continue -//! to build while we flesh out the full state machine. +//! The legacy handshake logic orchestrated the multi-stage `Connect` operation. With the +//! streamlined ConnectV2 state machine we only need a lightweight adapter that wires transport +//! connection attempts to/from the event loop. Higher-level routing decisions now live inside +//! `ConnectOpV2`. +use std::collections::HashMap; +use std::net::SocketAddr; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use std::time::Duration; use futures::Stream; +use parking_lot::RwLock; use tokio::sync::mpsc; use crate::dev_tool::{Location, PeerId, Transaction}; +use crate::node::network_bridge::ConnectionError; use crate::ring::ConnectionManager; use crate::router::Router; use crate::transport::{InboundConnectionHandler, OutboundConnectionHandler, PeerConnection}; -/// Events that will eventually be emitted by the ConnectV2 handshake handler. +/// Events emitted by the handshake driver. #[derive(Debug)] -#[allow(dead_code)] pub(crate) enum Event { + /// A remote peer initiated or completed a connection to us. InboundConnection { - transaction: Transaction, + transaction: Option, + peer: Option, connection: PeerConnection, - joiner: PeerId, courtesy: bool, }, + /// An outbound connection attempt succeeded. OutboundEstablished { transaction: Transaction, peer: PeerId, connection: PeerConnection, courtesy: bool, }, + /// An outbound connection attempt failed. OutboundFailed { transaction: Transaction, peer: PeerId, + error: ConnectionError, courtesy: bool, }, } -/// Commands delivered from the event loop into the handshake handler. +/// Commands delivered from the event loop into the handshake driver. #[derive(Debug)] -#[allow(dead_code)] pub(crate) enum Command { + /// Initiate a transport connection to `peer`. Connect { peer: PeerId, transaction: Transaction, courtesy: bool, }, - DropConnection { + /// Register expectation for an inbound connection from `peer`. + ExpectInbound { peer: PeerId, + transaction: Option, + courtesy: bool, }, + /// Remove state associated with `peer`. + DropConnection { peer: PeerId }, } -#[allow(dead_code)] +#[derive(Clone)] pub(crate) struct CommandSender(mpsc::Sender); impl CommandSender { - #[allow(dead_code)] pub async fn send(&self, cmd: Command) -> Result<(), mpsc::error::SendError> { + tracing::info!(?cmd, "handshake_v2: sending command"); self.0.send(cmd).await } } -/// Temporary stub implementation that just keeps channels alive. -#[allow(dead_code)] +/// Stream wrapper around the asynchronous handshake driver. pub(crate) struct HandshakeHandler { - #[allow(dead_code)] - inbound: InboundConnectionHandler, - #[allow(dead_code)] - outbound: OutboundConnectionHandler, - #[allow(dead_code)] - connection_manager: ConnectionManager, - #[allow(dead_code)] - router: Arc, - #[allow(dead_code)] - this_location: Option, - #[allow(dead_code)] - is_gateway: bool, - #[allow(dead_code)] - peer_ready: Option>, - commands_rx: mpsc::Receiver, + events_rx: mpsc::Receiver, } -#[allow(clippy::too_many_arguments)] -#[allow(dead_code)] impl HandshakeHandler { #[allow(clippy::too_many_arguments)] pub fn new( inbound: InboundConnectionHandler, outbound: OutboundConnectionHandler, - connection_manager: ConnectionManager, - router: Arc, - this_location: Option, - is_gateway: bool, + _connection_manager: ConnectionManager, + _router: Arc>, + _this_location: Option, + _is_gateway: bool, peer_ready: Option>, ) -> (Self, CommandSender) { - let (tx, rx) = mpsc::channel(1); + let (cmd_tx, cmd_rx) = mpsc::channel(128); + let (event_tx, event_rx) = mpsc::channel(128); + + tokio::spawn(async move { + run_driver(inbound, outbound, cmd_rx, event_tx, peer_ready).await; + }); + ( HandshakeHandler { - inbound, - outbound, - connection_manager, - router, - this_location, - is_gateway, - peer_ready, - commands_rx: rx, + events_rx: event_rx, }, - CommandSender(tx), + CommandSender(cmd_tx), ) } } @@ -117,10 +113,112 @@ impl Stream for HandshakeHandler { type Item = Event; fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match Pin::new(&mut self.commands_rx).poll_recv(cx) { - Poll::Ready(Some(_cmd)) => Poll::Pending, - Poll::Ready(None) => Poll::Ready(None), - Poll::Pending => Poll::Pending, + Pin::new(&mut self.events_rx).poll_recv(cx) + } +} + +#[derive(Debug)] +struct ExpectedInbound { + peer: PeerId, + transaction: Option, + courtesy: bool, +} + +async fn run_driver( + mut inbound: InboundConnectionHandler, + outbound: OutboundConnectionHandler, + mut commands_rx: mpsc::Receiver, + events_tx: mpsc::Sender, + peer_ready: Option>, +) { + use tokio::select; + + let mut expected_inbound: HashMap = HashMap::new(); + + loop { + select! { + command = commands_rx.recv() => match command { + Some(Command::Connect { peer, transaction, courtesy }) => { + spawn_outbound(outbound.clone(), events_tx.clone(), peer, transaction, courtesy, peer_ready.clone()); + } + Some(Command::ExpectInbound { peer, transaction, courtesy }) => { + expected_inbound.insert(peer.addr, ExpectedInbound { peer, transaction, courtesy }); + } + Some(Command::DropConnection { peer }) => { + expected_inbound.remove(&peer.addr); + } + None => break, + }, + inbound_conn = inbound.next_connection() => { + match inbound_conn { + Some(conn) => { + if let Some(flag) = &peer_ready { + flag.store(true, std::sync::atomic::Ordering::SeqCst); + } + + let remote_addr = conn.remote_addr(); + let entry = expected_inbound.remove(&remote_addr); + let (peer, transaction, courtesy) = if let Some(entry) = entry { + (Some(entry.peer), entry.transaction, entry.courtesy) + } else { + (None, None, false) + }; + + if events_tx.send(Event::InboundConnection { + transaction, + peer, + connection: conn, + courtesy, + }).await.is_err() { + break; + } + } + None => break, + } + } } } } + +fn spawn_outbound( + outbound: OutboundConnectionHandler, + events_tx: mpsc::Sender, + peer: PeerId, + transaction: Transaction, + courtesy: bool, + peer_ready: Option>, +) { + tokio::spawn(async move { + let peer_for_connect = peer.clone(); + let mut handler = outbound; + let connect_future = handler + .connect(peer_for_connect.pub_key.clone(), peer_for_connect.addr) + .await; + let result: Result = + match tokio::time::timeout(Duration::from_secs(10), connect_future).await { + Ok(res) => res.map_err(|err| err.into()), + Err(_) => Err(ConnectionError::Timeout), + }; + + if let Some(flag) = &peer_ready { + flag.store(true, std::sync::atomic::Ordering::SeqCst); + } + + let event = match result { + Ok(connection) => Event::OutboundEstablished { + transaction, + peer: peer.clone(), + connection, + courtesy, + }, + Err(error) => Event::OutboundFailed { + transaction, + peer: peer.clone(), + error, + courtesy, + }, + }; + + let _ = events_tx.send(event).await; + }); +} diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index c55045f3e..4812db655 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -6,7 +6,7 @@ use futures::FutureExt; use futures::StreamExt; use std::convert::Infallible; use std::future::Future; -use std::net::{IpAddr, SocketAddr}; +use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use std::pin::Pin; use std::time::Duration; use std::{ @@ -15,27 +15,24 @@ use std::{ }; use tokio::net::UdpSocket; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::oneshot::{self}; use tokio::time::timeout; use tracing::Instrument; use super::{ConnectionError, EventLoopNotificationsReceiver, NetworkBridge}; use crate::contract::{ContractHandlerEvent, WaitingTransaction}; use crate::message::{NetMessageV1, QueryResult}; -use crate::node::network_bridge::handshake::{ - Event as HandshakeEvent, ForwardInfo, HandshakeError, HandshakeEventStream, HandshakeHandler, - HanshakeHandlerMsg, OutboundMessage, +use crate::node::network_bridge::handshake_v2::{ + Command as HandshakeCommand, CommandSender as HandshakeCommandSender, Event as HandshakeEvent, + HandshakeHandler, }; use crate::node::network_bridge::priority_select; use crate::node::subscribe::SubscribeMsg; use crate::node::{MessageProcessor, PeerId}; -use crate::operations::{ - connect::ConnectMsg, connect_v2::ConnectMsgV2, get::GetMsg, put::PutMsg, update::UpdateMsg, -}; +use crate::operations::{connect_v2::ConnectMsgV2, get::GetMsg, put::PutMsg, update::UpdateMsg}; use crate::ring::Location; use crate::transport::{ create_connection_handler, OutboundConnectionHandler, PeerConnection, TransportError, - TransportKeypair, + TransportKeypair, TransportPublicKey, }; use crate::{ client_events::ClientId, @@ -150,6 +147,36 @@ impl P2pConnManager { let gateways = config.get_gateways()?; let key_pair = config.key_pair.clone(); + + // Initialize our peer identity before any connection attempts so join requests can + // reference the correct address. + let advertised_addr = { + let advertised_ip = config + .peer_id + .as_ref() + .map(|peer| peer.addr.ip()) + .or(config.config.network_api.public_address) + .unwrap_or_else(|| { + if listener_ip.is_unspecified() { + IpAddr::V4(Ipv4Addr::LOCALHOST) + } else { + listener_ip + } + }); + let advertised_port = config + .peer_id + .as_ref() + .map(|peer| peer.addr.port()) + .or(config.config.network_api.public_port) + .unwrap_or(listen_port); + SocketAddr::new(advertised_ip, advertised_port) + }; + bridge + .op_manager + .ring + .connection_manager + .try_set_peer_key(advertised_addr); + Ok(P2pConnManager { gateways, bridge, @@ -229,7 +256,7 @@ impl P2pConnManager { None }; - let (handshake_handler, handshake_handler_msg, outbound_message) = HandshakeHandler::new( + let (handshake_handler, handshake_cmd_sender) = HandshakeHandler::new( inbound_conn_handler, outbound_conn_handler.clone(), bridge.op_manager.ring.connection_manager.clone(), @@ -239,15 +266,11 @@ impl P2pConnManager { peer_ready, ); - // Create priority select stream ONCE by moving ownership - it stays alive across iterations. - // This fixes the lost wakeup race condition (issue #1932). - // HandshakeEventStream wraps HandshakeHandler and implements Stream properly. - let handshake_stream = HandshakeEventStream::new(handshake_handler); let select_stream = priority_select::ProductionPrioritySelectStream::new( notification_channel.notifications_receiver, notification_channel.op_execution_receiver, conn_bridge_rx, - handshake_stream, + handshake_handler, node_controller, client_wait_for_transaction, executor_listener, @@ -283,7 +306,7 @@ impl P2pConnManager { result, &mut state, &mut select_stream, - &handshake_handler_msg, + &handshake_cmd_sender, ) .await?; @@ -298,13 +321,8 @@ impl P2pConnManager { peer = %ctx.bridge.op_manager.ring.connection_manager.get_peer_key().unwrap(), "Received inbound message from peer - processing" ); - ctx.handle_inbound_message( - msg, - &outbound_message, - &op_manager, - &mut state, - ) - .await?; + ctx.handle_inbound_message(msg, &op_manager, &mut state) + .await?; } ConnEvent::OutboundMessage(NetMessage::V1(NetMessageV1::Aborted(tx))) => { // TODO: handle aborted transaction as internal message @@ -335,13 +353,8 @@ impl P2pConnManager { "BUG: OutboundMessage targets self! This indicates a routing logic error - messages should not reach OutboundMessage handler if they target self" ); // Convert to InboundMessage and process locally - ctx.handle_inbound_message( - msg, - &outbound_message, - &op_manager, - &mut state, - ) - .await?; + ctx.handle_inbound_message(msg, &op_manager, &mut state) + .await?; continue; } @@ -354,7 +367,25 @@ impl P2pConnManager { // IMPORTANT: Use a single get() call to avoid TOCTOU race // between contains_key() and get(). The connection can be // removed by another task between those two calls. - let peer_connection = ctx.connections.get(&target_peer.peer); + let peer_connection = ctx + .connections + .get(&target_peer.peer) + .or_else(|| { + if target_peer.peer.addr.ip().is_unspecified() { + ctx.connection_entry_by_pub_key(&target_peer.peer.pub_key) + .map(|(existing_peer, sender)| { + tracing::info!( + tx = %msg.id(), + target_peer = %target_peer.peer, + resolved_addr = %existing_peer.addr, + "Resolved outbound connection using peer public key due to unspecified address" + ); + sender + }) + } else { + None + } + }); tracing::debug!( tx = %msg.id(), self_peer = %ctx.bridge.op_manager.ring.connection_manager.pub_key, @@ -500,11 +531,17 @@ impl P2pConnManager { ctx.connections.remove(&peer); // Notify handshake handler to clean up - if let Err(e) = handshake_handler_msg - .drop_connection(peer.clone()) + if let Err(error) = handshake_cmd_sender + .send(HandshakeCommand::DropConnection { + peer: peer.clone(), + }) .await { - tracing::warn!(%peer, error = ?e, "Failed to drop connection during cleanup"); + tracing::warn!( + %peer, + ?error, + "Failed to drop connection during cleanup" + ); } } @@ -521,9 +558,7 @@ impl P2pConnManager { // Best effort notification - ignore errors since we're shutting down anyway // The callback sender will handle cleanup on their side for mut callback in callbacks.drain(..) { - let _ = callback - .send_result(Err(HandshakeError::ChannelClosed)) - .await; + let _ = callback.send_result(Err(())).await; } } @@ -535,44 +570,82 @@ impl P2pConnManager { ConnEvent::NodeAction(action) => match action { NodeEvent::DropConnection(peer) => { tracing::debug!(self_peer = %ctx.bridge.op_manager.ring.connection_manager.pub_key, %peer, conn_map_size = ctx.connections.len(), "[CONN_TRACK] REMOVE: DropConnection event - removing from connections HashMap"); + if let Err(error) = handshake_cmd_sender + .send(HandshakeCommand::DropConnection { peer: peer.clone() }) + .await + { + tracing::warn!( + %peer, + ?error, + "Failed to enqueue DropConnection command" + ); + } if let Some(conn) = ctx.connections.remove(&peer) { // TODO: review: this could potentially leave garbage tasks in the background with peer listener - timeout( + match timeout( Duration::from_secs(1), conn.send(Right(ConnEvent::NodeAction( NodeEvent::DropConnection(peer), ))), ) .await - .inspect_err( - |error| { + { + Ok(Ok(())) => {} + Ok(Err(send_error)) => { + tracing::error!( + ?send_error, + "Failed to send drop connection message" + ); + } + Err(elapsed) => { tracing::error!( - "Failed to send drop connection message: {:?}", - error + ?elapsed, + "Timeout while sending drop connection message" ); - }, - )??; + } + } } } NodeEvent::ConnectPeer { peer, tx, callback, - is_gw, + is_gw: courtesy, } => { + tracing::info!( + tx = %tx, + remote = %peer, + remote_addr = %peer.addr, + courtesy, + "NodeEvent::ConnectPeer received" + ); ctx.handle_connect_peer( peer, Box::new(callback), tx, - &handshake_handler_msg, + &handshake_cmd_sender, &mut state, - is_gw, + courtesy, ) .await?; } NodeEvent::ExpectPeerConnection { peer } => { - tracing::debug!(%peer, "ExpectPeerConnection event received; registering inbound expectation"); + tracing::debug!(%peer, "ExpectPeerConnection event received; registering inbound expectation via handshake driver"); state.outbound_handler.expect_incoming(peer.addr); + if let Err(error) = handshake_cmd_sender + .send(HandshakeCommand::ExpectInbound { + peer: peer.clone(), + transaction: None, + courtesy: false, + }) + .await + { + tracing::warn!( + %peer, + ?error, + "Failed to enqueue ExpectInbound command; inbound connection may be dropped" + ); + } } NodeEvent::SendMessage { target, msg } => { // Send the message to the target peer over the network @@ -585,17 +658,26 @@ impl P2pConnManager { } NodeEvent::QueryConnections { callback } => { let connections = ctx.connections.keys().cloned().collect(); - timeout( + match timeout( Duration::from_secs(1), callback.send(QueryResult::Connections(connections)), ) .await - .inspect_err(|error| { - tracing::error!( - "Failed to send connections query result: {:?}", - error - ); - })??; + { + Ok(Ok(())) => {} + Ok(Err(send_error)) => { + tracing::error!( + ?send_error, + "Failed to send connections query result" + ); + } + Err(elapsed) => { + tracing::error!( + ?elapsed, + "Timeout while sending connections query result" + ); + } + } } NodeEvent::QuerySubscriptions { callback } => { // Get network subscriptions from OpManager @@ -638,17 +720,26 @@ impl P2pConnManager { connected_peers: connections, }; - timeout( + match timeout( Duration::from_secs(1), callback.send(QueryResult::NetworkDebug(debug_info)), ) .await - .inspect_err(|error| { - tracing::error!( - "Failed to send subscriptions query result: {:?}", - error - ); - })??; + { + Ok(Ok(())) => {} + Ok(Err(send_error)) => { + tracing::error!( + ?send_error, + "Failed to send subscriptions query result" + ); + } + Err(elapsed) => { + tracing::error!( + ?elapsed, + "Timeout while sending subscriptions query result" + ); + } + } } NodeEvent::QueryNodeDiagnostics { config, callback } => { use freenet_stdlib::client_api::{ @@ -800,17 +891,26 @@ impl P2pConnManager { } } - timeout( + match timeout( Duration::from_secs(2), callback.send(QueryResult::NodeDiagnostics(response)), ) .await - .inspect_err(|error| { - tracing::error!( - "Failed to send node diagnostics query result: {:?}", - error - ); - })??; + { + Ok(Ok(())) => {} + Ok(Err(send_error)) => { + tracing::error!( + ?send_error, + "Failed to send node diagnostics query result" + ); + } + Err(elapsed) => { + tracing::error!( + ?elapsed, + "Timeout while sending node diagnostics query result" + ); + } + } } NodeEvent::TransactionTimedOut(tx) => { // Clean up client subscription to prevent memory leak @@ -896,7 +996,7 @@ impl P2pConnManager { result: priority_select::SelectResult, state: &mut EventListenerState, select_stream: &mut priority_select::ProductionPrioritySelectStream, - handshake_handler_msg: &HanshakeHandlerMsg, + handshake_commands: &HandshakeCommandSender, ) -> anyhow::Result { let peer_id = &self.bridge.op_manager.ring.connection_manager.pub_key; @@ -922,7 +1022,7 @@ impl P2pConnManager { peer = %peer_id, "PrioritySelect: peer_connections READY" ); - self.handle_peer_connection_msg(msg, state, select_stream, handshake_handler_msg) + self.handle_peer_connection_msg(msg, state, select_stream, handshake_commands) .await } SelectResult::ConnBridge(msg) => { @@ -938,23 +1038,16 @@ impl P2pConnManager { "PrioritySelect: handshake event READY" ); match result { - Ok(event) => { - self.handle_handshake_action( - event, - state, - select_stream, - handshake_handler_msg, - ) - .await?; + Some(event) => { + self.handle_handshake_action(event, state, select_stream) + .await?; Ok(EventResult::Continue) } - Err(handshake_error) => { + None => { tracing::warn!( - ?handshake_error, - "Handshake handler yielded error; cleaning up pending connections" + "Handshake handler stream closed; notifying pending callbacks" ); - self.handle_handshake_failure(handshake_error, state) - .await?; + self.handle_handshake_stream_closed(state).await?; Ok(EventResult::Continue) } } @@ -986,7 +1079,6 @@ impl P2pConnManager { async fn handle_inbound_message( &self, msg: NetMessage, - outbound_message: &OutboundMessage, op_manager: &Arc, state: &mut EventListenerState, ) -> anyhow::Result<()> { @@ -995,12 +1087,7 @@ impl P2pConnManager { handle_aborted_op(tx, op_manager, &self.gateways).await?; } msg => { - if let Some(addr) = state.transient_conn.get(msg.id()) { - // Forward message to transient joiner - outbound_message.send_to(*addr, msg).await?; - } else { - self.process_message(msg, op_manager, None, state).await; - } + self.process_message(msg, op_manager, None, state).await; } } Ok(()) @@ -1055,34 +1142,82 @@ impl P2pConnManager { ); } + fn connection_entry_by_pub_key( + &self, + pub_key: &TransportPublicKey, + ) -> Option<(&PeerId, &PeerConnChannelSender)> { + self.connections + .iter() + .find(|(peer_id, _)| peer_id.pub_key == *pub_key) + } + async fn handle_connect_peer( &mut self, peer: PeerId, mut callback: Box, tx: Transaction, - handshake_handler_msg: &HanshakeHandlerMsg, + handshake_commands: &HandshakeCommandSender, state: &mut EventListenerState, - is_gw: bool, + courtesy: bool, ) -> anyhow::Result<()> { - tracing::info!(tx = %tx, remote = %peer, "Connecting to peer"); + let mut peer = peer; + let mut peer_addr = peer.addr; + + if peer_addr.ip().is_unspecified() { + if let Some((existing_peer, _)) = self.connection_entry_by_pub_key(&peer.pub_key) { + peer_addr = existing_peer.addr; + peer.addr = existing_peer.addr; + tracing::info!( + tx = %tx, + remote = %peer, + fallback_addr = %peer_addr, + courtesy, + "ConnectPeer provided unspecified address; using existing connection address" + ); + } else { + tracing::debug!( + tx = %tx, + courtesy, + "ConnectPeer received unspecified address without existing connection reference" + ); + } + } + + tracing::info!( + tx = %tx, + remote = %peer, + remote_addr = %peer_addr, + courtesy, + "Connecting to peer" + ); if let Some(blocked_addrs) = &self.blocked_addresses { if blocked_addrs.contains(&peer.addr) { - tracing::info!(tx = %tx, remote = %peer.addr, "Outgoing connection to peer blocked by local policy"); - // Don't propagate channel closed errors when notifying about blocked connections + tracing::info!( + tx = %tx, + remote = %peer.addr, + "Outgoing connection to peer blocked by local policy" + ); callback - .send_result(Err(HandshakeError::ConnectionError( - crate::node::network_bridge::ConnectionError::AddressBlocked(peer.addr), - ))) + .send_result(Err(())) .await - .inspect_err(|e| { - tracing::debug!("Failed to send blocked connection notification: {:?}", e) + .inspect_err(|error| { + tracing::debug!( + remote = %peer.addr, + ?error, + "Failed to notify caller about blocked connection" + ); }) .ok(); return Ok(()); } - tracing::debug!(tx = %tx, "Blocked addresses: {:?}, peer addr: {}", blocked_addrs, peer.addr); + tracing::debug!( + tx = %tx, + "Blocked addresses: {:?}, peer addr: {}", + blocked_addrs, + peer.addr + ); } - let peer_addr = peer.addr; + match state.awaiting_connection.entry(peer_addr) { std::collections::hash_map::Entry::Occupied(mut callbacks) => { let txs_entry = state.awaiting_connection_txs.entry(peer_addr).or_default(); @@ -1093,6 +1228,7 @@ impl P2pConnManager { tx = %tx, remote = %peer_addr, pending = callbacks.get().len(), + courtesy, "Connection already pending, queuing additional requester" ); callbacks.get_mut().push(callback); @@ -1101,6 +1237,7 @@ impl P2pConnManager { remote = %peer_addr, pending = callbacks.get().len(), pending_txs = ?txs_entry, + courtesy, "connect_peer: connection already pending, queued callback" ); return Ok(()); @@ -1109,167 +1246,83 @@ impl P2pConnManager { let txs_entry = state.awaiting_connection_txs.entry(peer_addr).or_default(); txs_entry.push(tx); tracing::debug!( - tx = %tx, + tx = %tx, remote = %peer_addr, + courtesy, "connect_peer: registering new pending connection" ); entry.insert(vec![callback]); tracing::info!( - tx = %tx, - remote = %peer_addr, - pending = 1, + tx = %tx, + remote = %peer_addr, + pending = 1, pending_txs = ?txs_entry, + courtesy, "connect_peer: registered new pending connection" ); state.outbound_handler.expect_incoming(peer_addr); } } - tracing::debug!( - tx = %tx, - remote = %peer.addr, - "connect_peer: dispatching establish_conn" - ); - let res = timeout( - Duration::from_secs(10), - handshake_handler_msg.establish_conn(peer.clone(), tx, is_gw), - ) - .await; - match res { - Ok(Ok(())) => { + + if let Err(error) = handshake_commands + .send(HandshakeCommand::Connect { + peer: peer.clone(), + transaction: tx, + courtesy, + }) + .await + { + tracing::warn!( + tx = %tx, + remote = %peer.addr, + courtesy, + ?error, + "Failed to enqueue connect command" + ); + self.bridge + .op_manager + .ring + .connection_manager + .prune_in_transit_connection(&peer); + let pending_txs = state.awaiting_connection_txs.remove(&peer_addr); + if let Some(callbacks) = state.awaiting_connection.remove(&peer_addr) { tracing::debug!( tx = %tx, - "Successfully initiated connection process for peer: {:?}", - peer - ); - if let Some(callbacks) = state.awaiting_connection.get(&peer.addr) { - tracing::debug!( - tx = %tx, - remote = %peer_addr, - pending_callbacks = callbacks.len(), - "connect_peer: handshake in flight" - ); - let pending_txs = state - .awaiting_connection_txs - .get(&peer_addr) - .cloned() - .unwrap_or_default(); - tracing::info!( - tx = %tx, - remote = %peer_addr, - pending_callbacks = callbacks.len(), - pending_txs = ?pending_txs, - "connect_peer: handshake initiated - awaiting completion" - ); - } - Ok(()) - } - Ok(Err(e)) => { - tracing::warn!( - tx = %tx, - remote = %peer.addr, - error = ?e, - "Handshake establish_conn returned error" + remote = %peer_addr, + callbacks = callbacks.len(), + courtesy, + "Cleaning up callbacks after connect command failure" ); - let pending_txs = state - .awaiting_connection_txs - .remove(&peer.addr) - .unwrap_or_default(); - if let Some(callbacks) = state.awaiting_connection.remove(&peer.addr) { - tracing::debug!( - tx = %tx, - remote = %peer.addr, - callbacks = callbacks.len(), - "Handshake establish_conn returned error - notifying callbacks" - ); - tracing::info!( - tx = %tx, - remote = %peer.addr, - callbacks = callbacks.len(), - pending_txs = ?pending_txs, - "connect_peer: handshake errored - notifying callbacks" - ); - let mut callbacks = callbacks.into_iter(); - if let Some(mut cb) = callbacks.next() { - cb.send_result(Err(e)) - .await - .inspect_err(|send_err| { - tracing::debug!( - remote = %peer.addr, - error = ?send_err, - "Failed to deliver handshake error to awaiting callback" - ); - }) - .ok(); - } - for mut cb in callbacks { - cb.send_result(Err(HandshakeError::ConnectionClosed(peer.addr))) - .await - .inspect_err(|send_err| { - tracing::debug!( - remote = %peer.addr, - error = ?send_err, - "Failed to deliver fallback handshake error to awaiting callback" - ); - }) - .ok(); - } + for mut cb in callbacks { + cb.send_result(Err(())) + .await + .inspect_err(|send_err| { + tracing::debug!( + remote = %peer_addr, + ?send_err, + "Failed to deliver connect command failure to awaiting callback" + ); + }) + .ok(); } - Ok(()) } - Err(elapsed) => { - tracing::warn!( - tx = %tx, - remote = %peer.addr, - elapsed = ?elapsed, - "Timeout while establishing connection" + if let Some(pending_txs) = pending_txs { + tracing::debug!( + remote = %peer_addr, + pending_txs = ?pending_txs, + "Removed pending transactions after connect command failure" ); - let pending_txs = state - .awaiting_connection_txs - .remove(&peer.addr) - .unwrap_or_default(); - if let Some(callbacks) = state.awaiting_connection.remove(&peer.addr) { - tracing::debug!( - tx = %tx, - remote = %peer.addr, - callbacks = callbacks.len(), - "Handshake timed out - notifying callbacks" - ); - tracing::info!( - tx = %tx, - remote = %peer.addr, - callbacks = callbacks.len(), - pending_txs = ?pending_txs, - "connect_peer: handshake timed out - notifying callbacks" - ); - let mut iter = callbacks.into_iter(); - if let Some(mut cb) = iter.next() { - cb.send_result(Err(HandshakeError::ConnectionClosed(peer.addr))) - .await - .inspect_err(|send_err| { - tracing::debug!( - remote = %peer.addr, - error = ?send_err, - "Failed to deliver connection timeout to awaiting callback" - ); - }) - .ok(); - } - for mut cb in iter { - cb.send_result(Err(HandshakeError::ChannelClosed)) - .await - .inspect_err(|send_err| { - tracing::debug!( - remote = %peer.addr, - error = ?send_err, - "Failed to deliver fallback connection timeout to awaiting callback" - ); - }) - .ok(); - } - } - Ok(()) } + } else { + tracing::debug!( + tx = %tx, + remote = %peer_addr, + courtesy, + "connect_peer: handshake command dispatched" + ); } + + Ok(()) } async fn handle_handshake_action( @@ -1277,310 +1330,176 @@ impl P2pConnManager { event: HandshakeEvent, state: &mut EventListenerState, select_stream: &mut priority_select::ProductionPrioritySelectStream, - _handshake_handler_msg: &HanshakeHandlerMsg, // Parameter added ) -> anyhow::Result<()> { + tracing::info!(?event, "handle_handshake_action: received handshake event"); match event { HandshakeEvent::InboundConnection { - id, - conn, - joiner, - op, - forward_info, - is_bootstrap, + transaction, + peer, + connection, + courtesy, } => { + let remote_addr = connection.remote_addr(); + if let Some(blocked_addrs) = &self.blocked_addresses { - if blocked_addrs.contains(&joiner.addr) { - tracing::info!(%id, remote = %joiner.addr, "Inbound connection from peer blocked by local policy"); - // Not proceeding with adding connection or processing the operation. - // Don't call drop_connection_by_addr as it can cause channels to close abruptly - // Just ignore the connection and let it timeout naturally + if blocked_addrs.contains(&remote_addr) { + tracing::info!( + remote = %remote_addr, + courtesy, + transaction = ?transaction, + "Inbound connection blocked by local policy" + ); return Ok(()); } } - // Only insert if connection doesn't already exist to avoid dropping existing channel - if !self.connections.contains_key(&joiner) { - let (tx, rx) = mpsc::channel(1); - tracing::debug!(self_peer = %self.bridge.op_manager.ring.connection_manager.pub_key, %joiner, %id, conn_map_size = self.connections.len(), "[CONN_TRACK] INSERT: InboundConnection - adding to connections HashMap"); - self.connections.insert(joiner.clone(), tx); - let task = peer_connection_listener(rx, conn).boxed(); - select_stream.push_peer_connection(task); - } else { - tracing::debug!(self_peer = %self.bridge.op_manager.ring.connection_manager.pub_key, %joiner, %id, conn_map_size = self.connections.len(), "[CONN_TRACK] SKIP INSERT: InboundConnection - connection already exists in HashMap, dropping new connection"); - // Connection already exists - drop the new connection object but continue processing the operation - // The conn will be dropped here which closes the duplicate connection attempt - } - // IMPORTANT: Normally we do NOT add connection to ring here! - // Connection should only be added after StartJoinReq is accepted - // via CheckConnectivity. This prevents the "already connected" bug - // where gateways reject valid join requests. - // - // EXCEPTION: Gateway bootstrap (is_bootstrap=true) - // When a gateway accepts its very first connection (bootstrap case), - // we must register it immediately so the gateway can respond to - // FindOptimalPeer requests from subsequent joiners. Bootstrap connections - // bypass the normal CheckConnectivity flow. See forward_conn() in - // connect.rs and PR #1871 for full explanation. - if is_bootstrap { - let location = Location::from_address(&joiner.addr); + let peer_id = peer.unwrap_or_else(|| { tracing::info!( - %id, - %joiner, - %location, - "Bootstrap connection: immediately registering in ring" + remote = %remote_addr, + courtesy, + transaction = ?transaction, + "Inbound connection arrived without matching expectation; accepting provisionally" ); - self.bridge - .op_manager - .ring - .add_connection(location, joiner.clone(), true) - .await; - } + PeerId::new( + remote_addr, + (*self + .bridge + .op_manager + .ring + .connection_manager + .pub_key) + .clone(), + ) + }); - if let Some(op) = op { - self.bridge - .op_manager - .push(id, crate::operations::OpEnum::Connect(op)) - .await?; - } + tracing::info!( + remote = %peer_id.addr, + courtesy, + transaction = ?transaction, + "Inbound connection established" + ); - if let Some(ForwardInfo { - target: forward_to, - msg, - }) = forward_info.map(|b| *b) - { - self.try_to_forward(&forward_to, msg).await?; - } - } - HandshakeEvent::TransientForwardTransaction { - target, - tx, - forward_to, - msg, - } => { - if let Some(older_addr) = state.transient_conn.insert(tx, target) { - debug_assert_eq!(older_addr, target); - tracing::warn!(%target, %forward_to, "Transaction {} already exists as transient connections", tx); - if older_addr != target { - tracing::error!( - %tx, - "Not same target in new and old transient connections: {} != {}", - older_addr, target - ); - } - } - self.try_to_forward(&forward_to, *msg).await?; - } - HandshakeEvent::OutboundConnectionSuccessful { - peer_id, - connection, - } => { self.handle_successful_connection(peer_id, connection, state, select_stream, None) .await?; } - HandshakeEvent::OutboundGatewayConnectionSuccessful { - peer_id, + HandshakeEvent::OutboundEstablished { + transaction, + peer, connection, - remaining_checks, + courtesy, } => { - self.handle_successful_connection( - peer_id, - connection, - state, - select_stream, - Some(remaining_checks), - ) - .await?; - } - HandshakeEvent::OutboundConnectionFailed { peer_id, error } => { - tracing::info!(%peer_id, "Connection failed: {:?}", error); - if self.check_version { - if let HandshakeError::TransportError( - TransportError::ProtocolVersionMismatch { .. }, - ) = &error - { - // The TransportError already has a user-friendly error message - // Just propagate it without additional logging to avoid duplication - return Err(error.into()); - } - } - let pending_txs = state - .awaiting_connection_txs - .remove(&peer_id.addr) - .unwrap_or_default(); - if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { - tracing::debug!( - %peer_id, - callbacks = callbacks.len(), - "HandshakeEvent::OutboundConnectionFailed - notifying callbacks" - ); - tracing::info!( - %peer_id, - callbacks = callbacks.len(), - pending_txs = ?pending_txs, - "HandshakeEvent::OutboundConnectionFailed - notifying callbacks" - ); - let mut callbacks = callbacks.into_iter(); - if let Some(mut r) = callbacks.next() { - // Don't propagate channel closed errors - just log and continue - // The receiver may have timed out or been cancelled, which shouldn't crash the node - r.send_result(Err(error)) - .await - .inspect_err(|e| { - tracing::warn!(%peer_id, "Failed to send connection error notification - receiver may have timed out: {:?}", e); - }) - .ok(); - } - for mut r in callbacks { - if let Err(e) = r.send_result(Err(HandshakeError::ChannelClosed)).await { - tracing::debug!(%peer_id, "Failed to send fallback connection error notification: {:?}", e); - } - } - } - } - HandshakeEvent::RemoveTransaction(tx) => { - state.transient_conn.remove(&tx); - } - HandshakeEvent::OutboundGatewayConnectionRejected { peer_id } => { - tracing::info!(%peer_id, "Connection rejected by peer"); - let pending_txs = state - .awaiting_connection_txs - .remove(&peer_id.addr) - .unwrap_or_default(); - if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { - tracing::debug!( - %peer_id, - callbacks = callbacks.len(), - "HandshakeEvent::OutboundGatewayConnectionRejected - notifying callbacks" - ); - tracing::info!( - %peer_id, - callbacks = callbacks.len(), - pending_txs = ?pending_txs, - "HandshakeEvent::OutboundGatewayConnectionRejected - notifying callbacks" - ); - for mut r in callbacks { - // Don't propagate channel closed errors - just log and continue - if let Err(e) = r.send_result(Err(HandshakeError::ChannelClosed)).await { - tracing::debug!(%peer_id, "Failed to send rejection notification: {:?}", e); - } - } - } - } - HandshakeEvent::InboundConnectionRejected { peer_id } => { - tracing::debug!(%peer_id, "Inbound connection rejected"); + tracing::info!( + remote = %peer.addr, + courtesy, + transaction = %transaction, + "Outbound connection established" + ); + self.handle_successful_connection(peer, connection, state, select_stream, None) + .await?; } - } - Ok(()) - } + HandshakeEvent::OutboundFailed { + transaction, + peer, + error, + courtesy, + } => { + tracing::info!( + remote = %peer.addr, + courtesy, + transaction = %transaction, + ?error, + "Outbound connection failed" + ); + + self.bridge + .op_manager + .ring + .connection_manager + .prune_in_transit_connection(&peer); - async fn handle_handshake_failure( - &mut self, - handshake_error: HandshakeError, - state: &mut EventListenerState, - ) -> anyhow::Result<()> { - match handshake_error { - HandshakeError::ConnectionClosed(addr) => { let pending_txs = state .awaiting_connection_txs - .remove(&addr) + .remove(&peer.addr) .unwrap_or_default(); - if let Some(callbacks) = state.awaiting_connection.remove(&addr) { - tracing::info!( - remote = %addr, + + if let Some(callbacks) = state.awaiting_connection.remove(&peer.addr) { + tracing::debug!( + remote = %peer.addr, callbacks = callbacks.len(), pending_txs = ?pending_txs, - "Notifying callbacks after handshake connection closed" + courtesy, + "Notifying callbacks after outbound failure" ); let mut callbacks = callbacks.into_iter(); if let Some(mut cb) = callbacks.next() { - cb.send_result(Err(HandshakeError::ConnectionClosed(addr))) + cb.send_result(Err(())) .await .inspect_err(|err| { tracing::debug!( - remote = %addr, - error = ?err, - "Failed to notify primary handshake callback" - ); - }) - .ok(); - } - - for mut cb in callbacks { - cb.send_result(Err(HandshakeError::ChannelClosed)) - .await - .inspect_err(|err| { - tracing::debug!( - remote = %addr, - error = ?err, - "Failed to notify fallback handshake callback" + remote = %peer.addr, + ?err, + "Failed to deliver outbound failure notification" ); }) .ok(); } - } - - // Drop any pending transient transactions bound to this address - state - .transient_conn - .retain(|_, socket_addr| socket_addr != &addr); - } - HandshakeError::ChannelClosed => { - if !state.awaiting_connection.is_empty() { - tracing::warn!( - awaiting = state.awaiting_connection.len(), - "Handshake channel closed; notifying all pending callbacks" - ); - } - - let awaiting = std::mem::take(&mut state.awaiting_connection); - let awaiting_txs = std::mem::take(&mut state.awaiting_connection_txs); - - for (addr, callbacks) in awaiting { - let pending_txs = awaiting_txs.get(&addr).cloned().unwrap_or_default(); - tracing::debug!( - remote = %addr, - callbacks = callbacks.len(), - pending_txs = ?pending_txs, - "Delivering channel-closed notification to pending callbacks" - ); for mut cb in callbacks { - cb.send_result(Err(HandshakeError::ChannelClosed)) + cb.send_result(Err(())) .await .inspect_err(|err| { tracing::debug!( - remote = %addr, - error = ?err, - "Failed to deliver channel-closed handshake notification" + remote = %peer.addr, + ?err, + "Failed to deliver secondary outbound failure notification" ); }) .ok(); } } } - other => { - tracing::warn!( - ?other, - "Unhandled handshake error without socket association" - ); - } } - Ok(()) } - async fn try_to_forward(&mut self, forward_to: &PeerId, msg: NetMessage) -> anyhow::Result<()> { - if let Some(peer) = self.connections.get(forward_to) { - tracing::debug!(%forward_to, %msg, "Forwarding message to peer"); - // TODO: review: this could potentially leave garbage tasks in the background with peer listener - timeout(Duration::from_secs(1), peer.send(Left(msg))) - .await - .inspect_err(|error| { - tracing::error!("Failed to forward message to peer: {:?}", error); - })??; - } else { - tracing::warn!(%forward_to, "No connection to forward the message"); + async fn handle_handshake_stream_closed( + &mut self, + state: &mut EventListenerState, + ) -> anyhow::Result<()> { + if state.awaiting_connection.is_empty() { + return Ok(()); + } + + tracing::warn!( + awaiting = state.awaiting_connection.len(), + "Handshake driver closed; notifying pending callbacks" + ); + + let awaiting = std::mem::take(&mut state.awaiting_connection); + let awaiting_txs = std::mem::take(&mut state.awaiting_connection_txs); + + for (addr, callbacks) in awaiting { + let pending_txs = awaiting_txs.get(&addr).cloned().unwrap_or_default(); + tracing::debug!( + remote = %addr, + callbacks = callbacks.len(), + pending_txs = ?pending_txs, + "Delivering handshake driver shutdown notification" + ); + for mut cb in callbacks { + cb.send_result(Err(())) + .await + .inspect_err(|err| { + tracing::debug!( + remote = %addr, + ?err, + "Failed to deliver handshake driver shutdown notification" + ); + }) + .ok(); + } } + Ok(()) } @@ -1597,20 +1516,17 @@ impl P2pConnManager { .remove(&peer_id.addr) .unwrap_or_default(); if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { - let resolved_peer_id = if let Some(peer_id) = self - .bridge - .op_manager - .ring - .connection_manager - .get_peer_key() - { + let connection_manager = &self.bridge.op_manager.ring.connection_manager; + let resolved_peer_id = if let Some(peer_id) = connection_manager.get_peer_key() { peer_id } else { let self_addr = connection .my_address() .ok_or_else(|| anyhow::anyhow!("self addr should be set"))?; - let key = (*self.bridge.op_manager.ring.connection_manager.pub_key).clone(); - PeerId::new(self_addr, key) + connection_manager.try_set_peer_key(self_addr); + connection_manager + .get_peer_key() + .expect("peer key should be set after try_set_peer_key") }; tracing::debug!( remote = %peer_id.addr, @@ -1625,14 +1541,27 @@ impl P2pConnManager { "handle_successful_connection: connection established" ); for mut cb in callbacks { - timeout( + match timeout( Duration::from_secs(60), cb.send_result(Ok((resolved_peer_id.clone(), remaining_checks))), ) .await - .inspect_err(|error| { - tracing::error!("Failed to send connection result: {:?}", error); - })??; + { + Ok(Ok(())) => {} + Ok(Err(())) => { + tracing::debug!( + remote = %peer_id.addr, + "Callback dropped before receiving connection result" + ); + } + Err(error) => { + tracing::error!( + remote = %peer_id.addr, + ?error, + "Failed to deliver connection result" + ); + } + } } } else { tracing::warn!( @@ -1643,15 +1572,32 @@ impl P2pConnManager { } // Only insert if connection doesn't already exist to avoid dropping existing channel + let mut newly_inserted = false; if !self.connections.contains_key(&peer_id) { let (tx, rx) = mpsc::channel(10); tracing::debug!(self_peer = %self.bridge.op_manager.ring.connection_manager.pub_key, %peer_id, conn_map_size = self.connections.len(), "[CONN_TRACK] INSERT: OutboundConnectionSuccessful - adding to connections HashMap"); self.connections.insert(peer_id.clone(), tx); let task = peer_connection_listener(rx, connection).boxed(); select_stream.push_peer_connection(task); + newly_inserted = true; } else { tracing::debug!(self_peer = %self.bridge.op_manager.ring.connection_manager.pub_key, %peer_id, conn_map_size = self.connections.len(), "[CONN_TRACK] SKIP INSERT: OutboundConnectionSuccessful - connection already exists in HashMap"); } + + if newly_inserted { + let pending_loc = self + .bridge + .op_manager + .ring + .connection_manager + .prune_in_transit_connection(&peer_id); + let loc = pending_loc.unwrap_or_else(|| Location::from_address(&peer_id.addr)); + self.bridge + .op_manager + .ring + .add_connection(loc, peer_id.clone(), false) + .await; + } Ok(()) } @@ -1660,13 +1606,54 @@ impl P2pConnManager { msg: Option>, state: &mut EventListenerState, select_stream: &mut priority_select::ProductionPrioritySelectStream, - handshake_handler_msg: &HanshakeHandlerMsg, + handshake_commands: &HandshakeCommandSender, ) -> anyhow::Result { match msg { Some(Ok(peer_conn)) => { + let mut peer_conn = peer_conn; // Get the remote address from the connection let remote_addr = peer_conn.conn.remote_addr(); + if let Some(sender_peer) = extract_sender_from_message(&peer_conn.msg) { + if sender_peer.peer.addr == remote_addr + || sender_peer.peer.addr.ip().is_unspecified() + { + let mut new_peer_id = sender_peer.peer.clone(); + if new_peer_id.addr.ip().is_unspecified() { + new_peer_id.addr = remote_addr; + if let Some(sender_mut) = + extract_sender_from_message_mut(&mut peer_conn.msg) + { + if sender_mut.peer.addr.ip().is_unspecified() { + sender_mut.peer.addr = remote_addr; + } + } + } + if let Some(existing_key) = self + .connections + .keys() + .find(|peer| { + peer.addr == remote_addr && peer.pub_key != new_peer_id.pub_key + }) + .cloned() + { + if let Some(channel) = self.connections.remove(&existing_key) { + tracing::info!( + remote = %remote_addr, + old_peer = %existing_key, + new_peer = %new_peer_id, + "Updating provisional peer identity after inbound message" + ); + self.bridge + .op_manager + .ring + .update_connection_identity(&existing_key, new_peer_id.clone()); + self.connections.insert(new_peer_id, channel); + } + } + } + } + // Check if we need to establish a connection back to the sender let should_connect = !self.connections.keys().any(|peer| peer.addr == remote_addr) && !state.awaiting_connection.contains_key(&remote_addr); @@ -1679,7 +1666,7 @@ impl P2pConnManager { sender_peer.peer ); - let tx = Transaction::new::(); + let tx = Transaction::new::(); let (callback, _rx) = tokio::sync::mpsc::channel(10); // Don't await - let it happen in the background @@ -1688,9 +1675,9 @@ impl P2pConnManager { sender_peer.peer.clone(), Box::new(callback), tx, - handshake_handler_msg, + handshake_commands, state, - false, // not a gateway connection + false, // not a courtesy connection ) .await; } @@ -1716,7 +1703,16 @@ impl P2pConnManager { .prune_connection(peer.clone()) .await; self.connections.remove(&peer); - handshake_handler_msg.drop_connection(peer).await?; + if let Err(error) = handshake_commands + .send(HandshakeCommand::DropConnection { peer: peer.clone() }) + .await + { + tracing::warn!( + remote = %socket_addr, + ?error, + "Failed to notify handshake driver about dropped connection" + ); + } } } Ok(EventResult::Continue) @@ -1771,7 +1767,10 @@ impl P2pConnManager { EventResult::Event(ConnEvent::InboundMessage(msg).into()) } Some(Right(action)) => { - tracing::debug!("handle_notification_msg: Received NodeEvent notification"); + tracing::info!( + event = %action, + "handle_notification_msg: Received NodeEvent notification" + ); EventResult::Event(ConnEvent::NodeAction(action).into()) } None => EventResult::Event( @@ -1883,37 +1882,16 @@ impl P2pConnManager { trait ConnectResultSender { fn send_result( &mut self, - result: Result<(PeerId, Option), HandshakeError>, - ) -> Pin> + Send + '_>>; -} - -impl ConnectResultSender for Option>> { - fn send_result( - &mut self, - result: Result<(PeerId, Option), HandshakeError>, - ) -> Pin> + Send + '_>> { - async move { - self.take() - .expect("always set") - .send(result.map(|(id, _)| id)) - .map_err(|_| HandshakeError::ChannelClosed)?; - Ok(()) - } - .boxed() - } + result: Result<(PeerId, Option), ()>, + ) -> Pin> + Send + '_>>; } impl ConnectResultSender for mpsc::Sender), ()>> { fn send_result( &mut self, - result: Result<(PeerId, Option), HandshakeError>, - ) -> Pin> + Send + '_>> { - async move { - self.send(result.map_err(|_| ())) - .await - .map_err(|_| HandshakeError::ChannelClosed) - } - .boxed() + result: Result<(PeerId, Option), ()>, + ) -> Pin> + Send + '_>> { + async move { self.send(result).await.map_err(|_| ()) }.boxed() } } @@ -1924,7 +1902,6 @@ struct EventListenerState { // FIXME: we are potentially leaving trash here when transacrions are completed tx_to_client: HashMap>, client_waiting_transaction: Vec<(WaitingTransaction, HashSet)>, - transient_conn: HashMap, awaiting_connection: HashMap>>, awaiting_connection_txs: HashMap>, pending_op_results: HashMap>, @@ -1937,7 +1914,6 @@ impl EventListenerState { pending_from_executor: HashSet::new(), tx_to_client: HashMap::new(), client_waiting_transaction: Vec::new(), - transient_conn: HashMap::new(), awaiting_connection: HashMap::new(), pending_op_results: HashMap::new(), awaiting_connection_txs: HashMap::new(), @@ -2040,12 +2016,6 @@ fn decode_msg(data: &[u8]) -> Result { fn extract_sender_from_message(msg: &NetMessage) -> Option { match msg { NetMessage::V1(msg_v1) => match msg_v1 { - // Connect messages often have sender information - NetMessageV1::Connect(connect_msg) => match connect_msg { - ConnectMsg::Response { sender, .. } => Some(sender.clone()), - ConnectMsg::Request { target, .. } => Some(target.clone()), - _ => None, - }, NetMessageV1::ConnectV2(connect_msg) => match connect_msg { ConnectMsgV2::Response { sender, .. } => Some(sender.clone()), ConnectMsgV2::Request { from, .. } => Some(from.clone()), @@ -2083,4 +2053,39 @@ fn extract_sender_from_message(msg: &NetMessage) -> Option { } } +fn extract_sender_from_message_mut(msg: &mut NetMessage) -> Option<&mut PeerKeyLocation> { + match msg { + NetMessage::V1(msg_v1) => match msg_v1 { + NetMessageV1::ConnectV2(connect_msg) => match connect_msg { + ConnectMsgV2::Response { sender, .. } => Some(sender), + ConnectMsgV2::Request { from, .. } => Some(from), + ConnectMsgV2::ObservedAddress { target, .. } => Some(target), + }, + NetMessageV1::Get(get_msg) => match get_msg { + GetMsg::SeekNode { sender, .. } => Some(sender), + GetMsg::ReturnGet { sender, .. } => Some(sender), + _ => None, + }, + NetMessageV1::Put(put_msg) => match put_msg { + PutMsg::SeekNode { sender, .. } => Some(sender), + PutMsg::SuccessfulPut { sender, .. } => Some(sender), + PutMsg::PutForward { sender, .. } => Some(sender), + _ => None, + }, + NetMessageV1::Update(update_msg) => match update_msg { + UpdateMsg::SeekNode { sender, .. } => Some(sender), + UpdateMsg::Broadcasting { sender, .. } => Some(sender), + UpdateMsg::BroadcastTo { sender, .. } => Some(sender), + _ => None, + }, + NetMessageV1::Subscribe(subscribe_msg) => match subscribe_msg { + SubscribeMsg::SeekNode { subscriber, .. } => Some(subscriber), + SubscribeMsg::ReturnSub { sender, .. } => Some(sender), + _ => None, + }, + _ => None, + }, + } +} + // TODO: add testing for the network loop, now it should be possible to do since we don't depend upon having real connections diff --git a/crates/core/src/node/network_bridge/priority_select.rs b/crates/core/src/node/network_bridge/priority_select.rs index 68dfc2b65..68e4f2666 100644 --- a/crates/core/src/node/network_bridge/priority_select.rs +++ b/crates/core/src/node/network_bridge/priority_select.rs @@ -15,7 +15,6 @@ use crate::contract::{ }; use crate::dev_tool::{PeerId, Transaction}; use crate::message::{NetMessage, NodeEvent}; -use crate::node::network_bridge::handshake::HandshakeError; use crate::transport::TransportError; // P2pBridgeEvent type alias for the event bridge channel @@ -28,7 +27,7 @@ pub(super) enum SelectResult { OpExecution(Option<(tokio::sync::mpsc::Sender, NetMessage)>), PeerConnection(Option>), ConnBridge(Option), - Handshake(Result), + Handshake(Option), NodeController(Option), ClientTransaction( Result< @@ -90,7 +89,7 @@ impl ExecutorTransactionReceiver for ExecutorToEventLoopChannel, ExecutorToEventLoopChannel, >; @@ -101,7 +100,7 @@ pub(super) type ProductionPrioritySelectStream = PrioritySelectStream< /// alive across loop iterations, maintaining waker registration. pub(super) struct PrioritySelectStream where - H: Stream> + Unpin, + H: Stream + Unpin, C: ClientTransactionRelay, E: ExecutorTransactionReceiver, { @@ -134,7 +133,7 @@ where impl PrioritySelectStream where - H: Stream> + Unpin, + H: Stream + Unpin, C: ClientTransactionRelay, E: ExecutorTransactionReceiver, { @@ -180,7 +179,7 @@ where impl Stream for PrioritySelectStream where - H: Stream> + Unpin, + H: Stream + Unpin, C: ClientTransactionRelay, E: ExecutorTransactionReceiver, { @@ -254,8 +253,14 @@ where // Priority 5: Handshake handler (now implements Stream) // Poll the handshake handler stream - it maintains state across polls match Pin::new(&mut this.handshake_handler).poll_next(cx) { - Poll::Ready(Some(result)) => return Poll::Ready(Some(SelectResult::Handshake(result))), - Poll::Ready(None) => {} // Stream ended (shouldn't happen in practice) + Poll::Ready(Some(event)) => { + return Poll::Ready(Some(SelectResult::Handshake(Some(event)))) + } + Poll::Ready(None) => { + if first_closed_channel.is_none() { + first_closed_channel = Some(SelectResult::Handshake(None)); + } + } Poll::Pending => {} } diff --git a/crates/core/src/node/network_bridge/priority_select/tests.rs b/crates/core/src/node/network_bridge/priority_select/tests.rs index 480049fb2..1b22fa93c 100644 --- a/crates/core/src/node/network_bridge/priority_select/tests.rs +++ b/crates/core/src/node/network_bridge/priority_select/tests.rs @@ -7,7 +7,7 @@ use tokio::time::{sleep, timeout, Duration}; struct MockHandshakeStream; impl Stream for MockHandshakeStream { - type Item = Result; + type Item = crate::node::network_bridge::handshake_v2::Event; fn poll_next(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { Poll::Pending diff --git a/crates/core/src/node/op_state_manager.rs b/crates/core/src/node/op_state_manager.rs index df8b04fa4..3b597177e 100644 --- a/crates/core/src/node/op_state_manager.rs +++ b/crates/core/src/node/op_state_manager.rs @@ -366,6 +366,7 @@ impl OpManager { // Useful when we want to notify connection attempts, or other events that do not require any // network communication with other nodes. pub async fn notify_node_event(&self, msg: NodeEvent) -> Result<(), OpError> { + tracing::info!(event = %msg, "notify_node_event: queuing node event"); self.to_event_listener .notifications_sender .send(Either::Right(msg)) diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index d298aa149..75f39c87b 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -225,16 +225,30 @@ impl RelayState { } if self.forwarded_to.is_none() && self.request.ttl > 0 { - if let Some(next) = - ctx.select_next_hop(self.request.desired_location, &self.request.visited) - { - let mut forward_req = self.request.clone(); - forward_req.ttl = forward_req.ttl.saturating_sub(1); - push_unique_peer(&mut forward_req.visited, ctx.self_location().clone()); - let forward_snapshot = forward_req.clone(); - self.forwarded_to = Some(next.clone()); - self.request = forward_req; - actions.forward = Some((next, forward_snapshot)); + match ctx.select_next_hop(self.request.desired_location, &self.request.visited) { + Some(next) => { + tracing::debug!( + target = %self.request.desired_location, + ttl = self.request.ttl, + next_peer = %next.peer, + "connect_v2: forwarding join request to next hop" + ); + let mut forward_req = self.request.clone(); + forward_req.ttl = forward_req.ttl.saturating_sub(1); + push_unique_peer(&mut forward_req.visited, ctx.self_location().clone()); + let forward_snapshot = forward_req.clone(); + self.forwarded_to = Some(next.clone()); + self.request = forward_req; + actions.forward = Some((next, forward_snapshot)); + } + None => { + tracing::debug!( + target = %self.request.desired_location, + ttl = self.request.ttl, + visited = ?self.request.visited, + "connect_v2: no next hop candidates available" + ); + } } } @@ -300,6 +314,7 @@ pub struct AcceptedPeer { pub struct JoinerAcceptance { pub new_acceptor: Option, pub satisfied: bool, + pub assigned_location: bool, } impl JoinerState { @@ -315,6 +330,7 @@ impl JoinerState { peer: response.acceptor.clone(), courtesy: response.courtesy, }); + acceptance.assigned_location = self.accepted.len() == 1; } acceptance.satisfied = self.accepted.len() >= self.target_connections; acceptance @@ -335,6 +351,7 @@ pub(crate) struct ConnectOpV2 { pub(crate) state: Option, pub(crate) gateway: Option>, pub(crate) backoff: Option, + pub(crate) desired_location: Option, } impl ConnectOpV2 { @@ -359,6 +376,7 @@ impl ConnectOpV2 { state: Some(state), gateway: gateway.map(Box::new), backoff, + desired_location: Some(desired_location), } } @@ -380,6 +398,7 @@ impl ConnectOpV2 { state: Some(state), gateway: None, backoff: None, + desired_location: None, } } @@ -411,6 +430,10 @@ impl ConnectOpV2 { self.gateway.as_deref() } + fn take_desired_location(&mut self) -> Option { + self.desired_location.take() + } + pub(crate) fn initiate_join_request( own: PeerKeyLocation, target: PeerKeyLocation, @@ -608,42 +631,89 @@ impl Operation for ConnectOpV2 { Ok(store_operation_state(&mut self)) } - ConnectMsgV2::Response { payload, .. } => { - if let Some(acceptance) = self.handle_response(payload, Instant::now()) { - if let Some(new_acceptor) = acceptance.new_acceptor { - op_manager - .notify_node_event( - crate::message::NodeEvent::ExpectPeerConnection { + ConnectMsgV2::Response { + sender, payload, .. + } => { + if self.gateway.is_some() { + if let Some(acceptance) = self.handle_response(payload, Instant::now()) { + if acceptance.assigned_location { + if let Some(location) = self.take_desired_location() { + tracing::info!( + tx=%self.id, + assigned_location = %location.0, + "connect_v2: assigning joiner location" + ); + op_manager + .ring + .connection_manager + .update_location(Some(location)); + } + } + + if let Some(new_acceptor) = acceptance.new_acceptor { + op_manager + .notify_node_event( + crate::message::NodeEvent::ExpectPeerConnection { + peer: new_acceptor.peer.peer.clone(), + }, + ) + .await?; + + let (callback, mut rx) = mpsc::channel(1); + op_manager + .notify_node_event(NodeEvent::ConnectPeer { peer: new_acceptor.peer.peer.clone(), - }, - ) - .await?; - - let (callback, mut rx) = mpsc::channel(1); - op_manager - .notify_node_event(NodeEvent::ConnectPeer { - peer: new_acceptor.peer.peer.clone(), - tx: self.id, - callback, - is_gw: new_acceptor.courtesy, - }) - .await?; - - if let Some(result) = rx.recv().await { - if let Ok((peer_id, _remaining)) = result { - tracing::info!(%peer_id, tx=%self.id, "connect_v2 joined peer"); - } else { - tracing::warn!(tx=%self.id, "connect_v2 ConnectPeer failed"); + tx: self.id, + callback, + is_gw: new_acceptor.courtesy, + }) + .await?; + + if let Some(result) = rx.recv().await { + if let Ok((peer_id, _remaining)) = result { + tracing::info!( + %peer_id, + tx=%self.id, + "connect_v2 joined peer" + ); + } else { + tracing::warn!( + tx=%self.id, + "connect_v2 ConnectPeer failed" + ); + } } } - } - if acceptance.satisfied { - self.state = Some(ConnectState::Completed); + if acceptance.satisfied { + self.state = Some(ConnectState::Completed); + } } - } - Ok(store_operation_state(&mut self)) + Ok(store_operation_state(&mut self)) + } else if let Some(ConnectState::Relaying(state)) = self.state.as_mut() { + let upstream = state.upstream.clone(); + tracing::debug!( + %upstream.peer, + acceptor = %sender.peer, + "connect_v2: forwarding response towards joiner" + ); + let forward_msg = ConnectMsgV2::Response { + id: self.id, + sender: sender.clone(), + target: upstream.clone(), + payload: payload.clone(), + }; + network_bridge + .send( + &upstream.peer, + NetMessage::V1(NetMessageV1::ConnectV2(forward_msg)), + ) + .await?; + Ok(store_operation_state(&mut self)) + } else { + Ok(store_operation_state(&mut self)) + } } ConnectMsgV2::ObservedAddress { address, .. } => { self.handle_observed_address(*address, Instant::now()); @@ -694,6 +764,7 @@ fn store_operation_state_with_msg( state: Some(state), gateway: op.gateway.clone(), backoff: op.backoff.clone(), + desired_location: op.desired_location.clone(), })) }), } diff --git a/crates/core/src/operations/get.rs b/crates/core/src/operations/get.rs index 7f702d139..1963e87b3 100644 --- a/crates/core/src/operations/get.rs +++ b/crates/core/src/operations/get.rs @@ -482,7 +482,7 @@ impl Operation for GetOp { }) .await; - match get_result { + let local_value = match get_result { Ok(ContractHandlerEvent::GetResponse { response: Ok(StoreResponse { @@ -491,68 +491,86 @@ impl Operation for GetOp { }), .. }) => { - // Contract found locally! - tracing::info!( - tx = %id, - %key, - fetch_contract = *fetch_contract, - "GET: contract found locally in RequestGet handler" - ); + if *fetch_contract && contract.is_none() { + tracing::debug!( + tx = %id, + %key, + "GET: state available locally but contract code missing; continuing search" + ); + None + } else { + Some((state, contract)) + } + } + _ => None, + }; - // Check if this is a forwarded request or a local request - match &self.state { - Some(GetState::ReceivedRequest { requester }) - if requester.is_some() => - { - // This is a forwarded request - send result back to requester - let requester = requester.clone().unwrap(); - tracing::debug!(tx = %id, "Returning contract {} to requester {}", key, requester.peer); - new_state = None; - return_msg = Some(GetMsg::ReturnGet { - id: *id, - key: *key, - value: StoreResponse { - state: Some(state), - contract, - }, - sender: target.clone(), - target: requester, - skip_list: skip_list.clone(), - }); - } - _ => { - // This is the original requester (locally initiated request) - new_state = Some(GetState::Finished { key: *key }); - return_msg = None; - result = Some(GetResult { - key: *key, - state, + if let Some((state, contract)) = local_value { + // Contract found locally! + tracing::info!( + tx = %id, + %key, + fetch_contract = *fetch_contract, + "GET: contract found locally in RequestGet handler" + ); + + // Check if this is a forwarded request or a local request + match &self.state { + Some(GetState::ReceivedRequest { requester }) + if requester.is_some() => + { + // This is a forwarded request - send result back to requester + let requester = requester.clone().unwrap(); + tracing::debug!(tx = %id, "Returning contract {} to requester {}", key, requester.peer); + new_state = None; + return_msg = Some(GetMsg::ReturnGet { + id: *id, + key: *key, + value: StoreResponse { + state: Some(state), contract, - }); - } + }, + sender: target.clone(), + target: requester, + skip_list: skip_list.clone(), + }); + } + _ => { + // This is the original requester (locally initiated request) + new_state = Some(GetState::Finished { key: *key }); + return_msg = None; + result = Some(GetResult { + key: *key, + state, + contract, + }); } } - _ => { - // Contract not found locally, proceed with forwarding - tracing::debug!(tx = %id, %key, "Contract not found locally, forwarding to {}", target.peer); + } else { + // Contract not found locally (or missing code), proceed with forwarding + tracing::debug!( + tx = %id, + %key, + "Contract not found locally (or missing code), forwarding to {}", + target.peer + ); - // Prepare skip list with own peer ID - let own_loc = op_manager.ring.connection_manager.own_location(); - let mut new_skip_list = skip_list.clone(); - new_skip_list.insert(own_loc.peer.clone()); - - // Forward using standard routing helper - return try_forward_or_return( - *id, - *key, - (op_manager.ring.max_hops_to_live.max(1), *fetch_contract), - (target.clone(), sender.clone()), - new_skip_list, - op_manager, - stats, - ) - .await; - } + // Prepare skip list with own peer ID + let own_loc = op_manager.ring.connection_manager.own_location(); + let mut new_skip_list = skip_list.clone(); + new_skip_list.insert(own_loc.peer.clone()); + + // Forward using standard routing helper + return try_forward_or_return( + *id, + *key, + (op_manager.ring.max_hops_to_live.max(1), *fetch_contract), + (target.clone(), sender.clone()), + new_skip_list, + op_manager, + stats, + ) + .await; } } } @@ -616,46 +634,38 @@ impl Operation for GetOp { .await; // Process get result - match get_result { + let local_value = match get_result { Ok(ContractHandlerEvent::GetResponse { - key, response: Ok(StoreResponse { state: Some(state), contract, }), + .. }) => { - tracing::debug!(tx = %id, "Contract {key} found @ peer {}", target.peer); - - match self.state { - Some(GetState::AwaitingResponse { requester, .. }) => { - if let Some(requester) = requester { - // Forward contract to requester - new_state = None; - tracing::debug!(tx = %id, "Returning contract {} to {}", key, sender.peer); - return_msg = Some(GetMsg::ReturnGet { - id, - key, - value: StoreResponse { - state: Some(state), - contract, - }, - sender: target.clone(), - target: requester, - skip_list: skip_list.clone(), - }); - } else { - // Operation completed for original requester - tracing::debug!( - tx = %id, - "Completed operation, get response received for contract {key}" - ); - new_state = None; - return_msg = None; - } - } - Some(GetState::ReceivedRequest { .. }) => { - // Return contract to sender + if fetch_contract && contract.is_none() { + tracing::debug!( + tx = %id, + %key, + %this_peer, + "Contract state available but code missing @ peer {}, retrying", + sender.peer + ); + None + } else { + Some((state, contract)) + } + } + _ => None, + }; + + if let Some((state, contract)) = local_value { + tracing::debug!(tx = %id, "Contract {key} found @ peer {}", target.peer); + + match self.state { + Some(GetState::AwaitingResponse { requester, .. }) => { + if let Some(requester) = requester { + // Forward contract to requester new_state = None; tracing::debug!(tx = %id, "Returning contract {} to {}", key, sender.peer); return_msg = Some(GetMsg::ReturnGet { @@ -666,33 +676,56 @@ impl Operation for GetOp { contract, }, sender: target.clone(), - target: sender.clone(), + target: requester, skip_list: skip_list.clone(), }); + } else { + // Operation completed for original requester + tracing::debug!( + tx = %id, + "Completed operation, get response received for contract {key}" + ); + new_state = None; + return_msg = None; } - _ => return Err(OpError::invalid_transition(self.id)), } + Some(GetState::ReceivedRequest { .. }) => { + // Return contract to sender + new_state = None; + tracing::debug!(tx = %id, "Returning contract {} to {}", key, sender.peer); + return_msg = Some(GetMsg::ReturnGet { + id, + key, + value: StoreResponse { + state: Some(state), + contract, + }, + sender: target.clone(), + target: sender.clone(), + skip_list: skip_list.clone(), + }); + } + _ => return Err(OpError::invalid_transition(self.id)), } - _ => { - // Contract not found locally, try forwarding to other peers - tracing::debug!( - tx = %id, - %key, - %this_peer, - "Contract not found @ peer {}, retrying with other peers", - sender.peer - ); - return try_forward_or_return( - id, - key, - (htl, fetch_contract), - (this_peer, sender.clone()), - new_skip_list, - op_manager, - stats, - ) - .await; - } + } else { + // Contract not found locally, try forwarding to other peers + tracing::debug!( + tx = %id, + %key, + %this_peer, + "Contract not found @ peer {}, retrying with other peers", + sender.peer + ); + return try_forward_or_return( + id, + key, + (htl, fetch_contract), + (this_peer, sender.clone()), + new_skip_list, + op_manager, + stats, + ) + .await; } } GetMsg::ReturnGet { diff --git a/crates/core/src/operations/subscribe.rs b/crates/core/src/operations/subscribe.rs index c3047ae1d..ea501ae85 100644 --- a/crates/core/src/operations/subscribe.rs +++ b/crates/core/src/operations/subscribe.rs @@ -16,8 +16,11 @@ use freenet_stdlib::{ prelude::*, }; use serde::{Deserialize, Serialize}; +use tokio::time::{sleep, Duration}; const MAX_RETRIES: usize = 10; +const LOCAL_FETCH_TIMEOUT_MS: u64 = 1_500; +const LOCAL_FETCH_POLL_INTERVAL_MS: u64 = 25; fn subscribers_snapshot(op_manager: &OpManager, key: &ContractKey) -> Vec { op_manager @@ -31,6 +34,22 @@ fn subscribers_snapshot(op_manager: &OpManager, key: &ContractKey) -> Vec Result { + let mut elapsed = 0; + while elapsed < LOCAL_FETCH_TIMEOUT_MS { + if super::has_contract(op_manager, key).await? { + return Ok(true); + } + sleep(Duration::from_millis(LOCAL_FETCH_POLL_INTERVAL_MS)).await; + elapsed += LOCAL_FETCH_POLL_INTERVAL_MS; + } + Ok(false) +} + #[derive(Debug)] enum SubscribeState { /// Prepare the request to subscribe. @@ -520,7 +539,13 @@ impl Operation for SubscribeOp { return Ok(return_not_subbed()); } - if !super::has_contract(op_manager, *key).await? { + if wait_for_local_contract(op_manager, *key).await? { + tracing::info!( + tx = %id, + %key, + "Fetched contract locally while handling subscribe" + ); + } else { tracing::warn!( tx = %id, %key, diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index 1e1e8858a..242e12972 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -229,6 +229,16 @@ impl ConnectionManager { ); accepted }; + tracing::info!( + %peer_id, + accepted, + total_conn, + open_connections = open, + reserved_connections = self + .reserved_connections + .load(std::sync::atomic::Ordering::SeqCst), + "should_accept: final decision" + ); if !accepted { self.reserved_connections .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); @@ -357,6 +367,50 @@ impl ConnectionManager { std::mem::drop(lop); } + pub fn update_peer_identity(&self, old_peer: &PeerId, new_peer: PeerId) -> bool { + if old_peer == &new_peer { + tracing::debug!(%old_peer, "update_peer_identity: identical peers; skipping"); + return false; + } + + let mut loc_for_peer = self.location_for_peer.write(); + let Some(loc) = loc_for_peer.remove(old_peer) else { + tracing::debug!( + %old_peer, + %new_peer, + "update_peer_identity: old peer entry not found" + ); + return false; + }; + + tracing::info!(%old_peer, %new_peer, %loc, "Updating peer identity for active connection"); + loc_for_peer.insert(new_peer.clone(), loc); + drop(loc_for_peer); + + let mut cbl = self.connections_by_location.write(); + let entry = cbl.entry(loc).or_default(); + if let Some(conn) = entry + .iter_mut() + .find(|conn| conn.location.peer == *old_peer) + { + conn.location.peer = new_peer; + } else { + tracing::warn!( + %old_peer, + "update_peer_identity: connection entry missing; creating placeholder" + ); + entry.push(Connection { + location: PeerKeyLocation { + peer: new_peer, + location: Some(loc), + }, + open_at: Instant::now(), + }); + } + + true + } + fn prune_connection(&self, peer: &PeerId, is_alive: bool) -> Option { let connection_type = if is_alive { "active" } else { "in transit" }; tracing::debug!(%peer, "Pruning {} connection", connection_type); diff --git a/crates/core/src/ring/mod.rs b/crates/core/src/ring/mod.rs index 3a8aae2ed..22f41f64a 100644 --- a/crates/core/src/ring/mod.rs +++ b/crates/core/src/ring/mod.rs @@ -191,22 +191,28 @@ impl Ring { /// Return if a contract is within appropiate seeding distance. pub fn should_seed(&self, key: &ContractKey) -> bool { - let own_loc = self - .connection_manager - .own_location() - .location - .expect("should be set"); - self.seeding_manager.should_seed(key, own_loc) + match self.connection_manager.own_location().location { + Some(own_loc) => self.seeding_manager.should_seed(key, own_loc), + None => { + tracing::debug!( + "should_seed: own location not yet available; deferring seeding decision" + ); + false + } + } } /// Add a new subscription for this peer. pub fn seed_contract(&self, key: ContractKey) -> (Option, Vec) { - let own_loc = self - .connection_manager - .own_location() - .location - .expect("should be set"); - self.seeding_manager.seed_contract(key, own_loc) + match self.connection_manager.own_location().location { + Some(own_loc) => self.seeding_manager.seed_contract(key, own_loc), + None => { + tracing::debug!( + "seed_contract: own location not yet available; skipping seeding for now" + ); + (None, Vec::new()) + } + } } /// Whether this node already is seeding to this contract or not. @@ -237,6 +243,15 @@ impl Ring { self.refresh_density_request_cache() } + pub fn update_connection_identity(&self, old_peer: &PeerId, new_peer: PeerId) { + if self + .connection_manager + .update_peer_identity(old_peer, new_peer) + { + self.refresh_density_request_cache(); + } + } + fn refresh_density_request_cache(&self) { let cbl = self.connection_manager.get_connections_by_location(); let topology_manager = &mut self.connection_manager.topology_manager.write(); From 5bd717f692e515f120c349e0228fe8d9cc6dc037 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Fri, 7 Nov 2025 00:45:10 +0100 Subject: [PATCH 26/26] refactor(core): drop legacy connect handshake and clean warnings --- crates/core/src/message.rs | 8 - crates/core/src/node/mod.rs | 1 - crates/core/src/node/network_bridge.rs | 1 - .../core/src/node/network_bridge/handshake.rs | 1569 ----------------- .../node/network_bridge/handshake/tests.rs | 651 ------- .../src/node/network_bridge/p2p_protoc.rs | 9 - crates/core/src/node/op_state_manager.rs | 22 +- crates/core/src/node/testing_impl.rs | 5 - crates/core/src/operations/connect.rs | 1144 +----------- crates/core/src/operations/connect_v2.rs | 2 +- crates/core/src/operations/mod.rs | 2 - crates/core/src/ring/connection.rs | 7 - crates/core/src/ring/connection_manager.rs | 65 +- crates/core/src/ring/live_tx.rs | 27 +- crates/core/src/ring/mod.rs | 77 +- .../core/src/transport/connection_handler.rs | 19 - crates/core/src/transport/mod.rs | 7 - crates/core/src/transport/packet_data.rs | 11 - crates/core/src/transport/peer_connection.rs | 77 - 19 files changed, 34 insertions(+), 3670 deletions(-) delete mode 100644 crates/core/src/node/network_bridge/handshake.rs delete mode 100644 crates/core/src/node/network_bridge/handshake/tests.rs diff --git a/crates/core/src/message.rs b/crates/core/src/message.rs index 264011dfb..3e28dea5c 100644 --- a/crates/core/src/message.rs +++ b/crates/core/src/message.rs @@ -383,11 +383,6 @@ pub(crate) enum NodeEvent { ExpectPeerConnection { peer: PeerId, }, - /// Send a message to a peer over the network - SendMessage { - target: PeerId, - msg: Box, - }, } #[derive(Debug, Clone)] @@ -467,9 +462,6 @@ impl Display for NodeEvent { NodeEvent::ExpectPeerConnection { peer } => { write!(f, "ExpectPeerConnection (from {peer})") } - NodeEvent::SendMessage { target, msg } => { - write!(f, "SendMessage (to {target}, tx: {})", msg.id()) - } } } } diff --git a/crates/core/src/node/mod.rs b/crates/core/src/node/mod.rs index 240b66832..0076be7bc 100644 --- a/crates/core/src/node/mod.rs +++ b/crates/core/src/node/mod.rs @@ -1472,7 +1472,6 @@ pub trait IsOperationCompleted { impl IsOperationCompleted for OpEnum { fn is_completed(&self) -> bool { match self { - OpEnum::Connect(op) => op.is_completed(), OpEnum::ConnectV2(op) => op.is_completed(), OpEnum::Put(op) => op.is_completed(), OpEnum::Get(op) => op.is_completed(), diff --git a/crates/core/src/node/network_bridge.rs b/crates/core/src/node/network_bridge.rs index 0caa11d09..0ad76dbe0 100644 --- a/crates/core/src/node/network_bridge.rs +++ b/crates/core/src/node/network_bridge.rs @@ -16,7 +16,6 @@ use tokio::sync::mpsc::{self, Receiver, Sender}; use super::PeerId; use crate::message::{NetMessage, NodeEvent}; -mod handshake; mod handshake_v2; pub(crate) mod in_memory; pub(crate) mod p2p_protoc; diff --git a/crates/core/src/node/network_bridge/handshake.rs b/crates/core/src/node/network_bridge/handshake.rs deleted file mode 100644 index 821a4a68f..000000000 --- a/crates/core/src/node/network_bridge/handshake.rs +++ /dev/null @@ -1,1569 +0,0 @@ -#![allow(dead_code)] -//! Handles initial connection handshake. -use parking_lot::RwLock; -use std::{ - collections::{HashMap, HashSet}, - net::SocketAddr, - sync::{atomic::AtomicBool, Arc}, -}; -use tokio::time::{timeout, Duration}; -use tracing::Instrument; - -use futures::{future::BoxFuture, stream::FuturesUnordered, Future, FutureExt, TryFutureExt}; -use tokio::sync::mpsc::{self}; - -use crate::{ - dev_tool::{Location, PeerId, Transaction}, - message::{InnerMessage, NetMessage, NetMessageV1}, - node::NetworkBridge, - operations::connect::{ - forward_conn, ConnectMsg, ConnectOp, ConnectRequest, ConnectResponse, ConnectState, - ConnectivityInfo, ForwardParams, - }, - ring::{ConnectionManager, PeerKeyLocation, Ring}, - router::Router, - transport::{ - InboundConnectionHandler, OutboundConnectionHandler, PeerConnection, TransportError, - }, -}; - -type Result = std::result::Result; -type OutboundConnResult = Result; - -const TIMEOUT: Duration = Duration::from_secs(30); - -#[derive(Debug)] -pub(super) struct ForwardInfo { - pub target: PeerId, - pub msg: NetMessage, -} - -#[derive(Debug, thiserror::Error)] -pub(super) enum HandshakeError { - #[error("channel closed")] - ChannelClosed, - #[error("connection closed to {0}")] - ConnectionClosed(SocketAddr), - #[error(transparent)] - Serialization(#[from] Box), - #[error(transparent)] - TransportError(#[from] TransportError), - #[error("receibed an unexpected message at this point: {0}")] - UnexpectedMessage(Box), - #[error("connection error: {0}")] - ConnectionError(#[from] super::ConnectionError), -} - -#[derive(Debug)] -pub(super) enum Event { - /// An inbound connection to a peer was successfully established at a gateway. - InboundConnection { - id: Transaction, - conn: PeerConnection, - joiner: PeerId, - op: Option>, - forward_info: Option>, - /// If true, this is a gateway bootstrap acceptance that should be registered immediately. - /// See forward_conn() in connect.rs for full explanation. - is_bootstrap: bool, - }, - /// An outbound connection to a peer was successfully established. - OutboundConnectionSuccessful { - peer_id: PeerId, - connection: PeerConnection, - }, - /// An outbound connection to a peer failed to be established. - OutboundConnectionFailed { - peer_id: PeerId, - error: HandshakeError, - }, - /// An outbound connection to a gateway was rejected. - OutboundGatewayConnectionRejected { peer_id: PeerId }, - /// An inbound connection in a gateway was rejected. - InboundConnectionRejected { peer_id: PeerId }, - /// An outbound connection to a gateway was successfully established. It can be managed by the connection manager. - OutboundGatewayConnectionSuccessful { - peer_id: PeerId, - connection: PeerConnection, - remaining_checks: usize, - }, - /// Clean up a transaction that was completed or duplicate. - RemoveTransaction(Transaction), - /// Wait for replies via an other peer from forwarded connection attempts. - TransientForwardTransaction { - target: SocketAddr, - tx: Transaction, - forward_to: PeerId, - msg: Box, - }, -} - -/// NOTE: This enum is no longer used but kept for reference during transition. -/// The Stream implementation infers the forward result from forward_conn's ConnectState. -#[allow(dead_code, clippy::large_enum_variant)] -enum ForwardResult { - Forward(PeerId, NetMessage, ConnectivityInfo), - DirectlyAccepted(ConnectivityInfo), - /// Gateway bootstrap acceptance - connection should be registered immediately. - /// See forward_conn() in connect.rs and PR #1871 for context. - BootstrapAccepted(ConnectivityInfo), - Rejected, -} - -/// Use for sending messages to a peer which has not yet been confirmed at a logical level -/// or is just a transient connection (e.g. in case of gateways just forwarding messages). -pub(super) struct OutboundMessage(mpsc::Sender<(SocketAddr, NetMessage)>); - -impl OutboundMessage { - pub async fn send_to(&self, remote: SocketAddr, msg: NetMessage) -> Result<()> { - self.0 - .send((remote, msg)) - .await - .map_err(|_| HandshakeError::ChannelClosed)?; - Ok(()) - } -} - -pub(super) enum ExternConnection { - Establish { - peer: PeerId, - tx: Transaction, - is_gw: bool, - }, - Dropped { - peer: PeerId, - }, - #[allow(dead_code)] - DropConnectionByAddr(SocketAddr), -} - -/// Used for communicating with the HandshakeHandler. -pub(super) struct HanshakeHandlerMsg(pub(crate) mpsc::Sender); - -impl HanshakeHandlerMsg { - pub async fn establish_conn(&self, remote: PeerId, tx: Transaction, is_gw: bool) -> Result<()> { - self.0 - .send(ExternConnection::Establish { - peer: remote, - tx, - is_gw, - }) - .await - .map_err(|_| HandshakeError::ChannelClosed)?; - Ok(()) - } - - pub async fn drop_connection(&self, remote: PeerId) -> Result<()> { - self.0 - .send(ExternConnection::Dropped { peer: remote }) - .await - .map_err(|_| HandshakeError::ChannelClosed)?; - Ok(()) - } - - #[allow(dead_code)] - pub async fn drop_connection_by_addr(&self, remote_addr: SocketAddr) -> Result<()> { - self.0 - .send(ExternConnection::DropConnectionByAddr(remote_addr)) - .await - .map_err(|_| HandshakeError::ChannelClosed)?; - Ok(()) - } -} - -type OutboundMessageSender = mpsc::Sender; -type OutboundMessageReceiver = mpsc::Receiver<(SocketAddr, NetMessage)>; -type EstablishConnectionReceiver = mpsc::Receiver; - -/// Manages the handshake process for establishing connections with peers. -/// Handles both inbound and outbound connection attempts, and manages -/// the transition from unconfirmed to confirmed connections. -pub(super) struct HandshakeHandler { - /// Tracks ongoing connection attempts by their remote socket address - connecting: HashMap, - - /// Set of socket addresses for established connections - connected: HashSet, - - /// Handles incoming connections from the network - inbound_conn_handler: InboundConnectionHandler, - - /// Initiates outgoing connections to remote peers - outbound_conn_handler: OutboundConnectionHandler, - - /// Queue of ongoing outbound connection attempts - /// Used for non-gateway peers initiating connections - ongoing_outbound_connections: FuturesUnordered>, - - /// Queue of inbound connections not yet confirmed at the logical level - /// Used primarily by gateways for handling new peer join requests - unconfirmed_inbound_connections: FuturesUnordered< - BoxFuture<'static, Result<(InternalEvent, PeerOutboundMessage), HandshakeError>>, - >, - - /// Mapping of socket addresses to channels for sending messages to peers - /// Used for both confirmed and unconfirmed connections - outbound_messages: HashMap, - - /// Receiver for messages to be sent to peers not yet confirmed - /// Part of the OutboundMessage public API - pending_msg_rx: OutboundMessageReceiver, - - /// Receiver for commands to establish new outbound connections - /// Part of the EstablishConnection public API - establish_connection_rx: EstablishConnectionReceiver, - - /// Manages the node's connections and topology - connection_manager: ConnectionManager, - - /// Handles routing decisions within the network - router: Arc>, - - /// If set, will sent the location over network messages. - /// - /// It will also determine whether to trust the location of peers sent in network messages or derive them from IP. - /// - /// This is used for testing deterministically with given location. In production this should always be none - /// and locations should be derived from IP addresses. - this_location: Option, - - /// Whether this node is a gateway - is_gateway: bool, - - /// Indicates when peer is ready to process client operations (peer_id has been set). - /// Only used for non-gateway peers - set to Some(flag) for regular peers, None for gateways - peer_ready: Option>, -} - -impl HandshakeHandler { - pub fn new( - inbound_conn_handler: InboundConnectionHandler, - outbound_conn_handler: OutboundConnectionHandler, - connection_manager: ConnectionManager, - router: Arc>, - this_location: Option, - is_gateway: bool, - peer_ready: Option>, - ) -> (Self, HanshakeHandlerMsg, OutboundMessage) { - let (pending_msg_tx, pending_msg_rx) = tokio::sync::mpsc::channel(100); - let (establish_connection_tx, establish_connection_rx) = tokio::sync::mpsc::channel(100); - let connector = HandshakeHandler { - connecting: HashMap::new(), - connected: HashSet::new(), - inbound_conn_handler, - outbound_conn_handler, - ongoing_outbound_connections: FuturesUnordered::new(), - unconfirmed_inbound_connections: FuturesUnordered::new(), - outbound_messages: HashMap::new(), - pending_msg_rx, - establish_connection_rx, - connection_manager, - router, - this_location, - is_gateway, - peer_ready, - }; - ( - connector, - HanshakeHandlerMsg(establish_connection_tx), - OutboundMessage(pending_msg_tx), - ) - } - - /// Tracks a new inbound connection and sets up message handling for it. - fn track_inbound_connection(&mut self, conn: PeerConnection) { - let (outbound_msg_sender, outbound_msg_recv) = mpsc::channel(100); - let remote = conn.remote_addr(); - tracing::debug!(%remote, "Tracking inbound connection - spawning gw_peer_connection_listener"); - let f = gw_peer_connection_listener(conn, PeerOutboundMessage(outbound_msg_recv)).boxed(); - self.unconfirmed_inbound_connections.push(f); - self.outbound_messages.insert(remote, outbound_msg_sender); - tracing::debug!(%remote, "Inbound connection tracked - unconfirmed count: {}", self.unconfirmed_inbound_connections.len()); - } - - /// Handles outbound messages to peers. - async fn outbound(&mut self, addr: SocketAddr, op: NetMessage) -> Option { - if let Some(alive_conn) = self.outbound_messages.get_mut(&addr) { - if let NetMessage::V1(NetMessageV1::Connect(op)) = &op { - let tx = *op.id(); - if self - .connecting - .get(&addr) - .filter(|current_tx| *current_tx != &tx) - .is_some() - { - // avoid duplicate connection attempts - tracing::warn!("Duplicate connection attempt to {addr}, ignoring"); - return Some(Event::RemoveTransaction(tx)); - } - self.connecting.insert(addr, tx); - } - - if alive_conn.send(op).await.is_err() { - self.outbound_messages.remove(&addr); - self.connecting.remove(&addr); - } - None - } else { - let mut send_to_remote = None; - if let NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Response { - msg: ConnectResponse::AcceptedBy { joiner, .. }, - .. - })) = &op - { - // this may be a reply message from a downstream peer to which it was forwarded previously - // for a transient connection, in this case we must send this message to the proper - // gw_transient_peer_conn future that is waiting for it - send_to_remote = Some(joiner.addr); - } - - if let Some(remote) = send_to_remote { - if let Some(addr) = self.outbound_messages.get_mut(&remote) { - if addr.send(op).await.is_err() { - tracing::warn!("Failed to send message to {addr}", addr = remote); - } - } else { - // this shouldn't happen really - tracing::error!("No outbound message sender for {addr}", addr = remote); - }; - return None; - } - - #[cfg(debug_assertions)] - { - unreachable!("Can't send messages to a peer without an established connection"); - } - #[cfg(not(debug_assertions))] - { - // we don't want to crash the node in case of a bug here - tracing::error!("No outbound message sender for {addr}", addr = addr); - None - } - } - } - - /// Starts an outbound connection to the given peer. - async fn start_outbound_connection( - &mut self, - remote: PeerId, - transaction: Transaction, - is_gw: bool, - ) { - if self.connected.contains(&remote.addr) { - tracing::warn!( - "Already connected to {}, ignore connection attempt", - remote.addr - ); - return; - } - self.connecting.insert(remote.addr, transaction); - tracing::debug!("Starting outbound connection to {addr}", addr = remote.addr); - let f = self - .outbound_conn_handler - .connect(remote.pub_key.clone(), remote.addr) - .await - .map(move |c| match c { - Ok(conn) if is_gw => { - tracing::debug!(%remote, "established outbound gw connection"); - Ok(InternalEvent::OutboundGwConnEstablished(remote, conn)) - } - Ok(conn) => { - tracing::debug!(%remote, "established outbound connection"); - Ok(InternalEvent::OutboundConnEstablished(remote, conn)) - } - Err(e) => { - tracing::debug!(%remote, "failed to establish outbound connection: {e}"); - Err((remote, e.into())) - } - }) - .boxed(); - self.ongoing_outbound_connections.push(f); - } -} - -/// Stream wrapper that takes ownership of HandshakeHandler and implements Stream properly. -/// This converts the event loop logic from wait_for_events into a proper Stream implementation. -pub(super) struct HandshakeEventStream { - handler: HandshakeHandler, -} - -impl HandshakeEventStream { - pub fn new(handler: HandshakeHandler) -> Self { - Self { handler } - } -} - -impl futures::stream::Stream for HandshakeEventStream { - type Item = Result; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { - use std::task::Poll; - - let handler = &mut self.handler; - - // Main event loop - mirrors the original `loop { tokio::select! {...} }` structure - // We loop internally to handle "continue" cases without returning to the executor - loop { - tracing::trace!( - "HandshakeEventStream::poll_next iteration - unconfirmed: {}, ongoing_outbound: {}", - handler.unconfirmed_inbound_connections.len(), - handler.ongoing_outbound_connections.len() - ); - - // Priority 1: Handle new inbound connections - // Poll the future and extract the result, then drop it before using handler again - let inbound_result = { - let inbound_fut = handler.inbound_conn_handler.next_connection(); - tokio::pin!(inbound_fut); - inbound_fut.poll(cx) - }; // inbound_fut dropped here - - match inbound_result { - Poll::Ready(Some(conn)) => { - tracing::debug!(from=%conn.remote_addr(), "New inbound connection"); - handler.track_inbound_connection(conn); - // This was a `continue` in the loop - loop again to re-poll all priorities - continue; - } - Poll::Ready(None) => { - return Poll::Ready(Some(Err(HandshakeError::ChannelClosed))); - } - Poll::Pending => {} - } - - // Priority 2: Process outbound connection attempts - if !handler.ongoing_outbound_connections.is_empty() { - match std::pin::Pin::new(&mut handler.ongoing_outbound_connections).poll_next(cx) { - Poll::Ready(Some(outbound_result)) => { - // Handle the result - may return event or continue - let result = handle_outbound_result(handler, outbound_result, cx); - if let Some(event) = result { - return Poll::Ready(Some(event)); - } else { - // Was a continue case - loop again to re-poll all priorities - continue; - } - } - Poll::Ready(None) => { - // FuturesUnordered is now empty - this is normal, just continue to next channel - } - Poll::Pending => {} - } - } - - // Priority 3: Handle unconfirmed inbound connections (for gateways) - if !handler.unconfirmed_inbound_connections.is_empty() { - match std::pin::Pin::new(&mut handler.unconfirmed_inbound_connections).poll_next(cx) - { - Poll::Ready(Some(res)) => { - tracing::debug!("Processing unconfirmed inbound connection"); - let (event, outbound_sender) = match res { - Ok(v) => v, - Err(e) => return Poll::Ready(Some(Err(e))), - }; - tracing::debug!("Unconfirmed connection event: {:?}", event); - let result = - handle_unconfirmed_inbound(handler, event, outbound_sender, cx); - if let Some(event) = result { - return Poll::Ready(Some(event)); - } else { - // Was a continue case - loop again to re-poll all priorities - continue; - } - } - Poll::Ready(None) => { - // FuturesUnordered is now empty - this is normal, just continue to next channel - } - Poll::Pending => {} - } - } - - // Priority 4: Handle outbound message requests - match handler.pending_msg_rx.poll_recv(cx) { - Poll::Ready(Some((addr, msg))) => { - // Call handler.outbound() - this returns Option - // Scope to drop the future borrow immediately - let result = { - let outbound_fut = handler.outbound(addr, msg); - tokio::pin!(outbound_fut); - outbound_fut.poll(cx) - }; - match result { - Poll::Ready(Some(event)) => { - return Poll::Ready(Some(Ok(event))); - } - Poll::Ready(None) => { - // outbound() returned None - continue to re-poll all priorities - continue; - } - Poll::Pending => { - // The outbound future is pending - continue to next priority - } - } - } - Poll::Ready(None) => { - return Poll::Ready(Some(Err(HandshakeError::ChannelClosed))); - } - Poll::Pending => {} - } - - // Priority 5: Handle connection establishment requests - match handler.establish_connection_rx.poll_recv(cx) { - Poll::Ready(Some(ExternConnection::Establish { peer, tx, is_gw })) => { - // Start outbound connection - call the async method - // Scope to drop the future borrow immediately - let _ = { - let start_fut = handler.start_outbound_connection(peer, tx, is_gw); - tokio::pin!(start_fut); - start_fut.poll(cx) - }; - // Poll it immediately - it will push futures to ongoing_outbound_connections - // Then loop again to re-poll all priorities (ongoing_outbound_connections might have work) - continue; - } - Poll::Ready(Some(ExternConnection::Dropped { peer })) => { - handler.connected.remove(&peer.addr); - handler.outbound_messages.remove(&peer.addr); - handler.connecting.remove(&peer.addr); - // Continue to re-poll all priorities - continue; - } - Poll::Ready(Some(ExternConnection::DropConnectionByAddr(addr))) => { - handler.connected.remove(&addr); - handler.outbound_messages.remove(&addr); - handler.connecting.remove(&addr); - // Continue to re-poll all priorities - continue; - } - Poll::Ready(None) => { - return Poll::Ready(Some(Err(HandshakeError::ChannelClosed))); - } - Poll::Pending => {} - } - - // All channels are pending - return Pending and wait to be woken - return Poll::Pending; - } // end of loop - } -} - -// Helper to handle outbound connection results -// Returns Some(event) if should return an event, None if should continue -fn handle_outbound_result( - handler: &mut HandshakeHandler, - result: OutboundConnResult, - cx: &mut std::task::Context<'_>, -) -> Option> { - match result { - Ok(InternalEvent::OutboundConnEstablished(peer_id, connection)) => { - tracing::info!(at=?connection.my_address(), from=%connection.remote_addr(), "Outbound connection successful"); - Some(Ok(Event::OutboundConnectionSuccessful { - peer_id, - connection, - })) - } - Ok(InternalEvent::OutboundGwConnEstablished(id, connection)) => { - tracing::info!(at=?connection.my_address(), from=%connection.remote_addr(), "Outbound gateway connection successful"); - if let Some(addr) = connection.my_address() { - tracing::debug!(%addr, "Attempting setting own peer key"); - handler.connection_manager.try_set_peer_key(addr); - - if let Some(ref peer_ready) = handler.peer_ready { - peer_ready.store(true, std::sync::atomic::Ordering::SeqCst); - tracing::info!("Peer initialization complete: peer_ready set to true, client operations now enabled"); - } - - if handler.this_location.is_none() { - handler - .connection_manager - .update_location(Some(Location::from_address(&addr))); - } - } - tracing::debug!(at=?connection.my_address(), from=%connection.remote_addr(), "Outbound connection to gw successful"); - - // Call wait_for_gw_confirmation - it pushes a future to ongoing_outbound_connections - let tx = match handler.connecting.get(&id.addr) { - Some(t) => *t, - None => { - tracing::error!("Transaction not found for gateway connection"); - return Some(Err(HandshakeError::ConnectionClosed( - connection.remote_addr(), - ))); - } - }; - let this_peer = handler.connection_manager.own_location().peer; - tracing::debug!(at=?connection.my_address(), %this_peer.addr, from=%connection.remote_addr(), remote_addr = %id, "Waiting for confirmation from gw"); - handler.ongoing_outbound_connections.push( - wait_for_gw_confirmation( - (this_peer, handler.this_location), - AcceptedTracker { - gw_peer: id.into(), - gw_conn: connection, - gw_accepted: false, - gw_accepted_processed: false, - remaining_checks: Ring::DEFAULT_MAX_HOPS_TO_LIVE, - accepted: 0, - total_checks: Ring::DEFAULT_MAX_HOPS_TO_LIVE, - tx, - }, - ) - .boxed(), - ); - None // Continue - } - Ok(InternalEvent::FinishedOutboundConnProcess(tracker)) => { - handler.connecting.remove(&tracker.gw_peer.peer.addr); - tracing::debug!(at=?tracker.gw_conn.my_address(), gw=%tracker.gw_conn.remote_addr(), "Done checking, connection not accepted by gw, dropping connection"); - Some(Ok(Event::OutboundGatewayConnectionRejected { - peer_id: tracker.gw_peer.peer, - })) - } - Ok(InternalEvent::OutboundGwConnConfirmed(tracker)) => { - tracing::debug!(at=?tracker.gw_conn.my_address(), from=%tracker.gw_conn.remote_addr(), "Outbound connection to gw confirmed"); - handler.connected.insert(tracker.gw_conn.remote_addr()); - handler.connecting.remove(&tracker.gw_conn.remote_addr()); - Some(Ok(Event::OutboundGatewayConnectionSuccessful { - peer_id: tracker.gw_peer.peer, - connection: tracker.gw_conn, - remaining_checks: tracker.remaining_checks, - })) - } - Ok(InternalEvent::NextCheck(tracker)) => { - handler - .ongoing_outbound_connections - .push(check_remaining_hops(tracker).boxed()); - None // Continue - } - Ok(InternalEvent::RemoteConnectionAttempt { remote, tracker }) => { - debug_assert!(!tracker.gw_accepted); - tracing::debug!( - at=?tracker.gw_conn.my_address(), - gw=%tracker.gw_conn.remote_addr(), - "Attempting remote connection to {remote}" - ); - - // Start outbound connection - poll it immediately to start the work - let _result = { - let start_fut = - handler.start_outbound_connection(remote.clone(), tracker.tx, false); - tokio::pin!(start_fut); - start_fut.poll(cx) - }; - - // Whether it completes or pends, push check_remaining_hops - let current_span = tracing::Span::current(); - let checking_hops_span = tracing::info_span!(parent: current_span, "checking_hops"); - handler.ongoing_outbound_connections.push( - check_remaining_hops(tracker) - .instrument(checking_hops_span) - .boxed(), - ); - None // Continue - } - Ok(InternalEvent::DropInboundConnection(addr)) => { - handler.connecting.remove(&addr); - handler.outbound_messages.remove(&addr); - None // Continue - } - Err((peer_id, error)) => { - tracing::debug!(from=%peer_id.addr, "Outbound connection failed: {error}"); - tracing::info!(from=%peer_id.addr, error = ?error, "Outbound connection failed"); - handler.connecting.remove(&peer_id.addr); - handler.outbound_messages.remove(&peer_id.addr); - handler.connection_manager.prune_alive_connection(&peer_id); - Some(Ok(Event::OutboundConnectionFailed { peer_id, error })) - } - Ok(other) => { - tracing::error!("Unexpected event: {other:?}"); - None // Continue - } - } -} - -// Helper to handle unconfirmed inbound events -// Returns Some(event) if should return, None if should continue -fn handle_unconfirmed_inbound( - handler: &mut HandshakeHandler, - event: InternalEvent, - outbound_sender: PeerOutboundMessage, - _cx: &mut std::task::Context<'_>, -) -> Option> { - match event { - InternalEvent::InboundGwJoinRequest(req) => { - // This requires async work - spawn it as a future - let conn_manager = handler.connection_manager.clone(); - let router = handler.router.clone(); - let this_location = handler.this_location; - let is_gateway = handler.is_gateway; - - // Spawn the async handling - let fut = handle_inbound_gw_join_request( - req, - conn_manager, - router, - this_location, - is_gateway, - outbound_sender, - ); - - handler.unconfirmed_inbound_connections.push(fut.boxed()); - None - } - InternalEvent::InboundConnectionAccepted { - id, - conn, - joiner, - op, - forward_info, - is_bootstrap, - } => { - tracing::debug!(%joiner, "Inbound connection accepted"); - // The outbound sender was already stored in outbound_messages by track_inbound_connection - // We just need to return the event - Some(Ok(Event::InboundConnection { - id, - conn, - joiner, - op, - forward_info, - is_bootstrap, - })) - } - InternalEvent::InboundConnectionRejected { peer_id, remote } => { - tracing::debug!(%peer_id, %remote, "Inbound connection rejected"); - handler.outbound_messages.remove(&remote); - handler.connecting.remove(&remote); - Some(Ok(Event::InboundConnectionRejected { peer_id })) - } - InternalEvent::TransientForward { - conn, - tx, - info, - target, - forward_to, - msg, - } => { - tracing::debug!(%target, %forward_to, "Transient forward"); - // Save transaction ID before moving tx - let transaction_id = tx.tx; - // Push gw_transient_peer_conn future to monitor this connection - handler - .unconfirmed_inbound_connections - .push(gw_transient_peer_conn(conn, outbound_sender, tx, info).boxed()); - Some(Ok(Event::TransientForwardTransaction { - target, - tx: transaction_id, - forward_to, - msg, - })) - } - InternalEvent::DropInboundConnection(addr) => { - tracing::debug!(%addr, "Dropping inbound connection"); - handler.outbound_messages.remove(&addr); - None - } - _ => { - tracing::warn!("Unhandled unconfirmed inbound event: {:?}", event); - None - } - } -} - -// Async function to handle InboundGwJoinRequest -async fn handle_inbound_gw_join_request( - mut req: InboundGwJoinRequest, - conn_manager: ConnectionManager, - router: Arc>, - this_location: Option, - is_gateway: bool, - outbound_sender: PeerOutboundMessage, -) -> Result<(InternalEvent, PeerOutboundMessage), HandshakeError> { - let location = if let Some((_, other)) = this_location.zip(req.location) { - other - } else { - Location::from_address(&req.conn.remote_addr()) - }; - - let should_accept = conn_manager.should_accept(location, &req.joiner); - let can_accept = should_accept && (is_gateway || conn_manager.num_connections() > 0); - - if can_accept { - // Accepted connection path: Send acceptance message, then forward - let accepted_msg = NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Response { - id: req.id, - sender: conn_manager.own_location(), - target: PeerKeyLocation { - peer: req.joiner.clone(), - location: Some(location), - }, - msg: ConnectResponse::AcceptedBy { - accepted: true, - acceptor: conn_manager.own_location(), - joiner: req.joiner.clone(), - }, - })); - - tracing::debug!(at=?req.conn.my_address(), from=%req.conn.remote_addr(), "Accepting connection"); - - if let Err(e) = req.conn.send(accepted_msg).await { - tracing::error!(%e, "Failed to send accepted message from gw, pruning reserved connection"); - conn_manager.prune_in_transit_connection(&req.joiner); - return Err(e.into()); - } - - let InboundGwJoinRequest { - conn, - id, - hops_to_live, - max_hops_to_live, - skip_connections, - skip_forwards, - joiner, - .. - } = req; - - // Forward the connection - let mut nw_bridge = ForwardPeerMessage { - msg: parking_lot::Mutex::new(None), - }; - - let my_peer_id = conn_manager.own_location(); - let joiner_pk_loc = PeerKeyLocation { - peer: joiner.clone(), - location: Some(location), - }; - - let mut skip_connections = skip_connections.clone(); - let mut skip_forwards = skip_forwards.clone(); - skip_connections.insert(my_peer_id.peer.clone()); - skip_forwards.insert(my_peer_id.peer.clone()); - - let forward_info = ForwardParams { - left_htl: hops_to_live, - max_htl: max_hops_to_live, - accepted: true, - skip_connections, - skip_forwards, - req_peer: my_peer_id.clone(), - joiner: joiner_pk_loc.clone(), - is_gateway, - }; - - match forward_conn( - id, - &conn_manager, - router.clone(), - &mut nw_bridge, - forward_info, - ) - .await - { - Err(err) => { - tracing::error!(%err, "Error forwarding connection"); - // Continue by returning DropInboundConnection - Ok(( - InternalEvent::DropInboundConnection(conn.remote_addr()), - outbound_sender, - )) - } - Ok(Some(conn_state)) => { - let ConnectState::AwaitingConnectivity(info) = conn_state else { - unreachable!("forward_conn should return AwaitingConnectivity if successful") - }; - - tracing::info!(%id, %joiner, "Creating InboundConnection event"); - - // Check if we have a forward message (forwarding) or not (direct acceptance) - let (op, forward_info_opt, is_bootstrap) = - if let Some((forward_target, msg)) = nw_bridge.msg.into_inner() { - ( - Some(Box::new(ConnectOp::new( - id, - Some(ConnectState::AwaitingConnectivity(info)), - None, - None, - ))), - Some(Box::new(ForwardInfo { - target: forward_target, - msg, - })), - false, - ) - } else if info.is_bootstrap_acceptance { - // Gateway bootstrap case: connection should be registered immediately - ( - Some(Box::new(ConnectOp::new( - id, - Some(ConnectState::AwaitingConnectivity(info)), - None, - None, - ))), - None, - true, - ) - } else { - // Normal direct acceptance - will wait for CheckConnectivity - ( - Some(Box::new(ConnectOp::new( - id, - Some(ConnectState::AwaitingConnectivity(info)), - None, - None, - ))), - None, - false, - ) - }; - - Ok(( - InternalEvent::InboundConnectionAccepted { - id, - conn, - joiner, - op, - forward_info: forward_info_opt, - is_bootstrap, - }, - outbound_sender, - )) - } - Ok(None) => { - // No forwarding target found - return event with op: None to signal rejection - // This matches original behavior where forward_result (None, _) returns Event with op: None - Ok(( - InternalEvent::InboundConnectionAccepted { - id, - conn, - joiner, - op: None, // Signals rejection/no forwarding possible - forward_info: None, - is_bootstrap: false, - }, - outbound_sender, - )) - } - } - } else { - // Transient connection path: Try to forward without accepting - // If should_accept was true but we can't actually accept (non-gateway with 0 connections), - // we need to clean up the reserved connection - if should_accept && !can_accept { - conn_manager.prune_in_transit_connection(&req.joiner); - tracing::debug!( - "Non-gateway with 0 connections cannot accept connection from {:?}", - req.joiner - ); - } - - let InboundGwJoinRequest { - mut conn, - id, - hops_to_live, - max_hops_to_live, - skip_connections, - skip_forwards, - joiner, - .. - } = req; - - let remote = conn.remote_addr(); - tracing::debug!(at=?conn.my_address(), from=%remote, "Transient connection"); - - // Try to forward the connection without accepting it - let joiner_loc = this_location.unwrap_or_else(|| Location::from_address(&remote)); - let joiner_pk_loc = PeerKeyLocation { - peer: joiner.clone(), - location: Some(joiner_loc), - }; - let my_peer_id = conn_manager.own_location(); - - let mut skip_connections_updated = skip_connections.clone(); - let mut skip_forwards_updated = skip_forwards.clone(); - skip_connections_updated.insert(joiner.clone()); - skip_forwards_updated.insert(joiner.clone()); - skip_connections_updated.insert(my_peer_id.peer.clone()); - skip_forwards_updated.insert(my_peer_id.peer.clone()); - - let forward_info = ForwardParams { - left_htl: hops_to_live, - max_htl: max_hops_to_live, - accepted: true, - skip_connections: skip_connections_updated, - skip_forwards: skip_forwards_updated, - req_peer: my_peer_id.clone(), - joiner: joiner_pk_loc.clone(), - is_gateway, - }; - - let mut nw_bridge = ForwardPeerMessage { - msg: parking_lot::Mutex::new(None), - }; - - match forward_conn( - id, - &conn_manager, - router.clone(), - &mut nw_bridge, - forward_info, - ) - .await - { - Ok(Some(conn_state)) => { - let ConnectState::AwaitingConnectivity(info) = conn_state else { - unreachable!("forward_conn should return AwaitingConnectivity if successful") - }; - - // Check the forwarding result - if let Some((forward_target, msg)) = nw_bridge.msg.into_inner() { - // Successfully forwarding to another peer - // Create a TransientConnection to track this - let tx = TransientConnection { - tx: id, - joiner: joiner.clone(), - }; - - // Push gw_transient_peer_conn future to monitor this connection - Ok(( - InternalEvent::TransientForward { - conn, - tx, - info, - target: remote, - forward_to: forward_target, - msg: Box::new(msg), - }, - outbound_sender, - )) - } else if info.is_bootstrap_acceptance { - // Bootstrap acceptance - accept it directly even though we didn't send acceptance yet - Ok(( - InternalEvent::InboundConnectionAccepted { - id, - conn, - joiner, - op: Some(Box::new(ConnectOp::new( - id, - Some(ConnectState::AwaitingConnectivity(info)), - None, - None, - ))), - forward_info: None, - is_bootstrap: true, - }, - outbound_sender, - )) - } else { - // Direct acceptance without forwarding - shouldn't happen for transient - // Clean up and reject - conn_manager.prune_in_transit_connection(&joiner); - Ok(( - InternalEvent::InboundConnectionRejected { - peer_id: joiner, - remote, - }, - outbound_sender, - )) - } - } - Ok(None) => { - // No peer to forward to - send rejection message - tracing::debug!(at=?conn.my_address(), from=%conn.remote_addr(), "Rejecting connection, no peers found to forward"); - let reject_msg = NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Response { - id, - sender: my_peer_id.clone(), - target: joiner_pk_loc, - msg: ConnectResponse::AcceptedBy { - accepted: false, - acceptor: my_peer_id, - joiner: joiner.clone(), - }, - })); - - if let Err(e) = conn.send(reject_msg).await { - tracing::error!(%e, "Failed to send rejection message"); - return Err(e.into()); - } - - // Clean up and reject - conn_manager.prune_in_transit_connection(&joiner); - Ok(( - InternalEvent::InboundConnectionRejected { - peer_id: joiner, - remote, - }, - outbound_sender, - )) - } - Err(e) => { - tracing::error!(from=%remote, "Error forwarding transient connection: {e}"); - // Drop the connection and clean up - conn_manager.prune_in_transit_connection(&joiner); - Ok(( - InternalEvent::DropInboundConnection(remote), - outbound_sender, - )) - } - } - } -} - -// Attempt forwarding the connection request to the next hop and wait for answers -// then return those answers to the transitory peer connection. -struct ForwardPeerMessage { - msg: parking_lot::Mutex>, -} - -impl NetworkBridge for ForwardPeerMessage { - async fn send(&self, target: &PeerId, forward_msg: NetMessage) -> super::ConnResult<()> { - debug_assert!(matches!( - forward_msg, - NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Request { - msg: ConnectRequest::CheckConnectivity { .. }, - .. - })) - )); - self.msg - .try_lock() - .expect("unique ref") - .replace((target.clone(), forward_msg)); - Ok(()) - } - - async fn drop_connection(&mut self, _: &PeerId) -> super::ConnResult<()> { - if cfg!(debug_assertions) { - unreachable!("drop_connection should not be called on ForwardPeerMessage") - } - Ok(()) - } -} - -#[derive(Debug)] -struct InboundGwJoinRequest { - conn: PeerConnection, - id: Transaction, - joiner: PeerId, - location: Option, - hops_to_live: usize, - max_hops_to_live: usize, - skip_connections: HashSet, - skip_forwards: HashSet, -} - -#[derive(Debug)] -enum InternalEvent { - InboundGwJoinRequest(InboundGwJoinRequest), - /// Regular connection established - OutboundConnEstablished(PeerId, PeerConnection), - OutboundGwConnEstablished(PeerId, PeerConnection), - OutboundGwConnConfirmed(AcceptedTracker), - DropInboundConnection(SocketAddr), - RemoteConnectionAttempt { - remote: PeerId, - tracker: AcceptedTracker, - }, - NextCheck(AcceptedTracker), - FinishedOutboundConnProcess(AcceptedTracker), - // New variants for forwarding results - InboundConnectionAccepted { - id: Transaction, - conn: PeerConnection, - joiner: PeerId, - op: Option>, - forward_info: Option>, - is_bootstrap: bool, - }, - InboundConnectionRejected { - peer_id: PeerId, - remote: SocketAddr, - }, - TransientForward { - conn: PeerConnection, - tx: TransientConnection, - info: ConnectivityInfo, - target: SocketAddr, - forward_to: PeerId, - msg: Box, - }, -} - -#[repr(transparent)] -#[derive(Debug)] -struct PeerOutboundMessage(mpsc::Receiver); - -#[derive(Debug)] -struct AcceptedTracker { - gw_peer: PeerKeyLocation, - gw_conn: PeerConnection, - gw_accepted_processed: bool, - gw_accepted: bool, - /// Remaining checks to be made, at max total_checks - remaining_checks: usize, - /// At max this will be total_checks - accepted: usize, - /// Equivalent to max_hops_to_live - total_checks: usize, - tx: Transaction, -} - -/// Waits for confirmation from a gateway after initiating a connection. -async fn wait_for_gw_confirmation( - (this_peer, this_location): (PeerId, Option), - mut tracker: AcceptedTracker, -) -> OutboundConnResult { - let gw_peer_id = tracker.gw_peer.peer.clone(); - let msg = NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Request { - id: tracker.tx, - target: tracker.gw_peer.clone(), - msg: ConnectRequest::StartJoinReq { - joiner: Some(this_peer.clone()), - joiner_key: this_peer.pub_key.clone(), - joiner_location: this_location, - hops_to_live: tracker.total_checks, - max_hops_to_live: tracker.total_checks, - skip_connections: HashSet::from([this_peer.clone()]), - skip_forwards: HashSet::from([this_peer.clone()]), - }, - })); - tracing::debug!( - at=?tracker.gw_conn.my_address(), - from=%tracker.gw_conn.remote_addr(), - msg = ?msg, - "Sending initial connection message to gw" - ); - tracker - .gw_conn - .send(msg) - .await - .map_err(|err| (gw_peer_id.clone(), HandshakeError::TransportError(err)))?; - tracing::debug!( - at=?tracker.gw_conn.my_address(), - from=%tracker.gw_conn.remote_addr(), - "Waiting for answer from gw" - ); - - // under this branch we just need to wait long enough for the gateway to reply with all the downstream - // connection attempts, and then we can drop the connection, so keep listening to it in a loop or timeout - let remote = tracker.gw_conn.remote_addr(); - tokio::time::timeout( - TIMEOUT, - check_remaining_hops(tracker), - ) - .await - .map_err(|_| { - tracing::debug!(from=%gw_peer_id, "Timed out waiting for acknowledgement from downstream requests"); - ( - gw_peer_id, - HandshakeError::ConnectionClosed(remote), - ) - })? -} - -async fn check_remaining_hops(mut tracker: AcceptedTracker) -> OutboundConnResult { - let remote_addr = tracker.gw_conn.remote_addr(); - let gw_peer_id = tracker.gw_peer.peer.clone(); - tracing::debug!( - at=?tracker.gw_conn.my_address(), - from=%tracker.gw_conn.remote_addr(), - "Checking for remaining hops, left: {}", tracker.remaining_checks - ); - while tracker.remaining_checks > 0 { - let msg = tokio::time::timeout( - TIMEOUT, - tracker - .gw_conn - .recv() - .map_err(|err| (gw_peer_id.clone(), HandshakeError::TransportError(err))), - ) - .map_err(|_| { - tracing::debug!(from = %gw_peer_id, "Timed out waiting for response from gw"); - ( - gw_peer_id.clone(), - HandshakeError::ConnectionClosed(remote_addr), - ) - }) - .await??; - let msg = decode_msg(&msg).map_err(|e| (gw_peer_id.clone(), e))?; - match msg { - NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Response { - msg: - ConnectResponse::AcceptedBy { - accepted, acceptor, .. - }, - .. - })) => { - tracker.remaining_checks -= 1; - if acceptor.peer.addr == tracker.gw_conn.remote_addr() { - // this is a message from the gw indicating if they accepted or not - tracker.gw_accepted_processed = true; - if accepted { - tracker.gw_accepted = true; - tracker.accepted += 1; - } - tracing::debug!( - at = ?tracker.gw_conn.my_address(), - from = %tracker.gw_conn.remote_addr(), - %accepted, - "Received answer from gw" - ); - if accepted { - return Ok(InternalEvent::OutboundGwConnConfirmed(tracker)); - } else { - tracing::debug!("Rejected by gateway, waiting for forward replies"); - return Ok(InternalEvent::NextCheck(tracker)); - } - } else if accepted { - return Ok(InternalEvent::RemoteConnectionAttempt { - remote: acceptor.peer, - tracker, - }); - } else { - continue; - } - } - NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Request { - msg: ConnectRequest::FindOptimalPeer { .. }, - .. - })) => { - tracing::warn!(from=%tracker.gw_conn.remote_addr(), "Received FindOptimalPeer request, ignoring"); - continue; - } - other => { - return Err(( - gw_peer_id, - HandshakeError::UnexpectedMessage(Box::new(other)), - )) - } - } - } - Ok(InternalEvent::FinishedOutboundConnProcess(tracker)) -} - -/// Handles communication with a potentially transient peer connection. -/// Used primarily by gateways to manage connections in the process of joining the network. -async fn gw_peer_connection_listener( - mut conn: PeerConnection, - mut outbound: PeerOutboundMessage, -) -> Result<(InternalEvent, PeerOutboundMessage), HandshakeError> { - tracing::debug!(from=%conn.remote_addr(), "Starting gw_peer_connection_listener"); - loop { - tokio::select! { - msg = outbound.0.recv() => { - let Some(msg) = msg else { break Err(HandshakeError::ConnectionClosed(conn.remote_addr())); }; - - tracing::debug!(at=?conn.my_address(), from=%conn.remote_addr() ,"Sending message to peer. Msg: {msg}"); - conn - .send(msg) - .await?; - } - msg = conn.recv() => { - let Ok(msg) = msg.map_err(|error| { - tracing::error!(at=?conn.my_address(), from=%conn.remote_addr(), "Error while receiving message: {error}"); - }) else { - break Err(HandshakeError::ConnectionClosed(conn.remote_addr())); - }; - let net_message = match decode_msg(&msg) { - Ok(msg) => msg, - Err(e) => { - tracing::error!( - at=?conn.my_address(), - from=%conn.remote_addr(), - error=%e, - "Failed to decode message - closing connection" - ); - break Err(HandshakeError::ConnectionClosed(conn.remote_addr())); - } - }; - tracing::debug!(at=?conn.my_address(), from=%conn.remote_addr(), %net_message, "Received message from peer"); - match net_message { - NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Request { - id, - msg: ConnectRequest::StartJoinReq { - joiner, - joiner_key, - hops_to_live, - max_hops_to_live, - skip_connections, - skip_forwards, - joiner_location - }, - .. - })) => { - let joiner = joiner.unwrap_or_else(|| { - tracing::debug!(%joiner_key, "Joiner not provided, using joiner key"); - PeerId::new(conn.remote_addr(), joiner_key) - }); - break Ok(( - InternalEvent::InboundGwJoinRequest(InboundGwJoinRequest { - conn, - id, - joiner, - location: joiner_location, - hops_to_live, - max_hops_to_live, - skip_connections, - skip_forwards, - }), - outbound, - )); - } - other => { - tracing::warn!( - at=?conn.my_address(), - from=%conn.remote_addr(), - %other, - "Unexpected message received from peer, terminating connection" - ); - break Err(HandshakeError::ConnectionClosed(conn.remote_addr())); - } - } - } - } - } -} - -/// Manages a transient connection during the joining process. -/// Handles forwarding of connection requests and tracking of responses. -async fn gw_transient_peer_conn( - mut conn: PeerConnection, - mut outbound: PeerOutboundMessage, - transaction: TransientConnection, - mut info: ConnectivityInfo, -) -> Result<(InternalEvent, PeerOutboundMessage), HandshakeError> { - // TODO: should be the same timeout as the one used for any other tx - loop { - tokio::select! { - incoming_result = timeout(TIMEOUT, conn.recv()) => { - match incoming_result { - Ok(Ok(msg)) => { - let net_msg = match decode_msg(&msg) { - Ok(msg) => msg, - Err(e) => { - tracing::error!( - at=?conn.my_address(), - from=%conn.remote_addr(), - error=%e, - "Failed to decode message from transient peer - closing connection" - ); - break Err(HandshakeError::ConnectionClosed(conn.remote_addr())); - } - }; - if transaction.is_drop_connection_message(&net_msg) { - tracing::debug!("Received drop connection message"); - break Ok((InternalEvent::DropInboundConnection(conn.remote_addr()), outbound)); - } else { - tracing::warn!( - at=?conn.my_address(), - from=%conn.remote_addr(), - %net_msg, - "Unexpected message received from peer, terminating connection" - ); - break Err(HandshakeError::ConnectionClosed(conn.remote_addr())); - } - } - Ok(Err(e)) => { - tracing::error!("Error receiving message: {:?}", e); - break Ok((InternalEvent::DropInboundConnection(conn.remote_addr()), outbound)); - } - Err(_) => { - tracing::debug!("Transient connection timed out"); - break Ok((InternalEvent::DropInboundConnection(conn.remote_addr()), outbound)); - } - } - } - outbound_msg = timeout(TIMEOUT, outbound.0.recv()) => { - match outbound_msg { - Ok(Some(msg)) => { - if matches!( - msg, - NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Response { msg: ConnectResponse::AcceptedBy { .. }, .. })) - ) { - let NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Response { - id, - target, - msg: ConnectResponse::AcceptedBy { accepted, acceptor, joiner }, - .. - })) = msg else { - unreachable!("Expected ConnectResponse::AcceptedBy after matches! guard") - }; - // in this case it may be a reply of a third party we forwarded to, - // and need to send that back to the joiner and count the reply - let msg = NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Response { - id, - sender: target, - target: acceptor.clone(), - msg: ConnectResponse::AcceptedBy { - accepted, - acceptor, - joiner, - }, - })); - conn.send(msg).await?; - if info.decrement_check() { // this means all checks have been performed - break Ok((InternalEvent::DropInboundConnection(conn.remote_addr()), outbound)); - } else { // still waiting for more checks - continue; - } - } - // other messages are just forwarded - conn.send(msg).await?; - } - Ok(None) => { - tracing::debug!("Outbound channel closed for transient connection"); - break Ok((InternalEvent::DropInboundConnection(conn.remote_addr()), outbound)); - } - Err(_) => { - tracing::debug!("Transient connection timed out"); - break Ok((InternalEvent::DropInboundConnection(conn.remote_addr()), outbound)); - } - } - } - } - } -} - -/// Tracks a transient connection that is being forwarded through this gateway. -/// This struct is only used by `gw_transient_peer_conn` to identify and validate -/// drop connection messages from the joiner. -/// -/// Note: In the original implementation, this struct also contained `max_hops_to_live`, -/// `hops_to_live`, `skip_connections`, and `skip_forwards` fields that were used by -/// the `forward_transient_connection` method. In the stream-based refactoring, these -/// values are used directly from the `InboundGwJoinRequest` when calling `forward_conn`, -/// so they don't need to be stored in this struct. -#[derive(Debug)] -struct TransientConnection { - tx: Transaction, - joiner: PeerId, -} - -impl TransientConnection { - fn is_drop_connection_message(&self, net_message: &NetMessage) -> bool { - if let NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Request { - id, - msg: ConnectRequest::CleanConnection { joiner }, - .. - })) = net_message - { - // this peer should never be receiving messages for other transactions or other peers at this point - debug_assert_eq!(id, &self.tx); - debug_assert_eq!(joiner.peer, self.joiner); - - if id != &self.tx || joiner.peer != self.joiner { - return false; - } - return true; - } - false - } -} - -#[inline(always)] -fn decode_msg(data: &[u8]) -> Result { - bincode::deserialize(data).map_err(HandshakeError::Serialization) -} - -#[cfg(test)] -mod tests; diff --git a/crates/core/src/node/network_bridge/handshake/tests.rs b/crates/core/src/node/network_bridge/handshake/tests.rs deleted file mode 100644 index e6aa30cf9..000000000 --- a/crates/core/src/node/network_bridge/handshake/tests.rs +++ /dev/null @@ -1,651 +0,0 @@ -use core::panic; -use std::{fmt::Display, sync::Arc, time::Duration}; - -use aes_gcm::{Aes128Gcm, KeyInit}; -use anyhow::{anyhow, bail}; -use serde::Serialize; -use tokio::sync::{mpsc, oneshot}; - -use super::*; -use crate::{ - dev_tool::TransportKeypair, - operations::connect::{ConnectMsg, ConnectResponse}, - ring::{Connection, PeerKeyLocation, Ring}, - transport::{ - ConnectionEvent, OutboundConnectionHandler, PacketData, RemoteConnection, SymmetricMessage, - SymmetricMessagePayload, TransportPublicKey, UnknownEncryption, - }, -}; - -struct TransportMock { - inbound_sender: mpsc::Sender, - outbound_recv: mpsc::Receiver<(SocketAddr, ConnectionEvent)>, - /// Outbount messages to peers - packet_senders: HashMap>)>, - /// Next packet id to use - packet_id: u32, - /// Inbound messages from peers - packet_receivers: Vec)>>, - in_key: Aes128Gcm, - my_addr: SocketAddr, -} - -impl TransportMock { - async fn new_conn(&mut self, addr: SocketAddr) { - let out_symm_key = Aes128Gcm::new_from_slice(&[0; 16]).unwrap(); - let in_symm_key = Aes128Gcm::new_from_slice(&[1; 16]).unwrap(); - let (conn, packet_sender, packet_recv) = - PeerConnection::new_test(addr, self.my_addr, out_symm_key, in_symm_key.clone()); - self.inbound_sender.send(conn).await.unwrap(); - tracing::debug!("New inbound connection established"); - self.packet_senders - .insert(addr, (in_symm_key, packet_sender)); - self.packet_receivers.push(packet_recv); - } - - async fn new_outbound_conn( - &mut self, - addr: SocketAddr, - callback: oneshot::Sender>, - ) { - let out_symm_key = Aes128Gcm::new_from_slice(&[0; 16]).unwrap(); - let in_symm_key = Aes128Gcm::new_from_slice(&[1; 16]).unwrap(); - let (conn, packet_sender, packet_recv) = - PeerConnection::new_remote_test(addr, self.my_addr, out_symm_key, in_symm_key.clone()); - callback - .send(Ok(conn)) - .map_err(|_| "Failed to send connection") - .unwrap(); - tracing::debug!("New outbound connection established"); - self.packet_senders - .insert(addr, (in_symm_key, packet_sender)); - self.packet_receivers.push(packet_recv); - } - - /// This would happen when a new unsolicited connection is established with a gateway or - /// when after initialising a connection with a peer via `outbound_recv`, a connection - /// is successfully established. - async fn establish_inbound_conn( - &mut self, - addr: SocketAddr, - pub_key: TransportPublicKey, - hops_to_live: Option, - ) { - let id = Transaction::new::(); - let target_peer_id = PeerId::new(addr, pub_key.clone()); - let target_peer = PeerKeyLocation::from(target_peer_id); - let hops_to_live = hops_to_live.unwrap_or(10); - let initial_join_req = ConnectMsg::Request { - id, - target: target_peer, - msg: ConnectRequest::StartJoinReq { - joiner: None, - joiner_key: pub_key, - joiner_location: None, - hops_to_live, - max_hops_to_live: hops_to_live, - skip_connections: HashSet::new(), - skip_forwards: HashSet::new(), - }, - }; - self.inbound_msg( - addr, - NetMessage::V1(NetMessageV1::Connect(initial_join_req)), - ) - .await - } - - async fn inbound_msg(&mut self, addr: SocketAddr, msg: impl Serialize + Display) { - tracing::debug!(at=?self.my_addr, to=%addr, "Sending message from peer"); - let msg = bincode::serialize(&msg).unwrap(); - let (out_symm_key, packet_sender) = self.packet_senders.get_mut(&addr).unwrap(); - let sym_msg = SymmetricMessage::serialize_msg_to_packet_data( - self.packet_id, - msg, - out_symm_key, - vec![], - ) - .unwrap(); - tracing::trace!(at=?self.my_addr, to=%addr, "Sending message to peer"); - packet_sender.send(sym_msg.into_unknown()).await.unwrap(); - tracing::trace!(at=?self.my_addr, to=%addr, "Message sent"); - self.packet_id += 1; - } - - async fn recv_outbound_msg(&mut self) -> anyhow::Result { - let receiver = &mut self.packet_receivers[0]; - let (_, msg) = receiver - .recv() - .await - .ok_or_else(|| anyhow::Error::msg("Failed to receive packet"))?; - let packet: PacketData = PacketData::from_buf(&*msg); - let packet = packet - .try_decrypt_sym(&self.in_key) - .map_err(|_| anyhow!("Failed to decrypt packet"))?; - let msg: SymmetricMessage = bincode::deserialize(packet.data()).unwrap(); - let payload = match msg { - SymmetricMessage { - payload: SymmetricMessagePayload::ShortMessage { payload }, - .. - } => payload, - SymmetricMessage { - payload: - SymmetricMessagePayload::StreamFragment { - total_length_bytes, - mut payload, - .. - }, - .. - } => { - let mut remaining = total_length_bytes as usize - payload.len(); - while remaining > 0 { - let (_, msg) = receiver - .recv() - .await - .ok_or_else(|| anyhow::Error::msg("Failed to receive packet"))?; - let packet: PacketData = PacketData::from_buf(&*msg); - let packet = packet - .try_decrypt_sym(&self.in_key) - .map_err(|_| anyhow!("Failed to decrypt packet"))?; - let msg: SymmetricMessage = bincode::deserialize(packet.data()).unwrap(); - match msg { - SymmetricMessage { - payload: SymmetricMessagePayload::StreamFragment { payload: new, .. }, - .. - } => { - payload.extend_from_slice(&new); - remaining -= new.len(); - } - _ => panic!("Unexpected message type"), - } - } - payload - } - _ => panic!("Unexpected message type"), - }; - let msg: NetMessage = bincode::deserialize(&payload).unwrap(); - Ok(msg) - } -} - -struct NodeMock { - establish_conn: HanshakeHandlerMsg, - _outbound_msg: OutboundMessage, -} - -impl NodeMock { - /// A request from node internals to establish a connection with a peer. - async fn establish_conn(&self, remote: PeerId, tx: Transaction, is_gw: bool) { - self.establish_conn - .establish_conn(remote, tx, is_gw) - .await - .unwrap(); - } -} - -struct TestVerifier { - transport: TransportMock, - node: NodeMock, -} - -fn config_handler( - addr: impl Into, - existing_connections: Option>, - is_gateway: bool, -) -> (HandshakeHandler, TestVerifier) { - let (outbound_sender, outbound_recv) = mpsc::channel(100); - let outbound_conn_handler = OutboundConnectionHandler::new(outbound_sender); - let (inbound_sender, inbound_recv) = mpsc::channel(100); - let inbound_conn_handler = InboundConnectionHandler::new(inbound_recv); - let addr = addr.into(); - let keypair = TransportKeypair::new(); - let mngr = ConnectionManager::default_with_key(keypair.public().clone()); - mngr.try_set_peer_key(addr); - let router = Router::new(&[]); - - if let Some(connections) = existing_connections { - for conn in connections { - let location = conn.get_location().location.unwrap(); - let peer_id = conn.get_location().peer.clone(); - mngr.add_connection(location, peer_id, false); - } - } - - let (handler, establish_conn, _outbound_msg) = HandshakeHandler::new( - inbound_conn_handler, - outbound_conn_handler, - mngr, - Arc::new(RwLock::new(router)), - None, - is_gateway, - None, // test code doesn't need peer_ready - ); - ( - handler, - TestVerifier { - transport: TransportMock { - inbound_sender, - outbound_recv, - packet_senders: HashMap::new(), - packet_receivers: Vec::new(), - in_key: Aes128Gcm::new_from_slice(&[0; 16]).unwrap(), - packet_id: 0, - my_addr: addr, - }, - node: NodeMock { - establish_conn, - _outbound_msg, - }, - }, - ) -} - -async fn start_conn( - test: &mut TestVerifier, - addr: SocketAddr, - pub_key: TransportPublicKey, - id: Transaction, - is_gw: bool, -) -> oneshot::Sender> { - test.node - .establish_conn(PeerId::new(addr, pub_key.clone()), id, is_gw) - .await; - let ( - trying_addr, - ConnectionEvent::ConnectionStart { - remote_public_key, - open_connection, - }, - ) = test - .transport - .outbound_recv - .recv() - .await - .ok_or_else(|| anyhow!("failed to get conn start req")) - .unwrap(); - assert_eq!(trying_addr, addr); - assert_eq!(remote_public_key, pub_key); - tracing::debug!("Received connection event"); - open_connection -} - -// ============================================================================ -// Stream-based tests for HandshakeEventStream -// ============================================================================ - -/// Helper to get the next event from a HandshakeEventStream -async fn next_stream_event(stream: &mut HandshakeEventStream) -> Result { - use futures::StreamExt; - stream.next().await.ok_or(HandshakeError::ChannelClosed)? -} - -#[tokio::test] -async fn test_stream_gateway_inbound_conn_success() -> anyhow::Result<()> { - let addr: SocketAddr = ([127, 0, 0, 1], 10000).into(); - let (handler, mut test) = config_handler(addr, None, true); - let mut stream = HandshakeEventStream::new(handler); - - let remote_addr = ([127, 0, 0, 1], 10001).into(); - let test_controller = async { - let pub_key = TransportKeypair::new().public().clone(); - test.transport.new_conn(remote_addr).await; - test.transport - .establish_inbound_conn(remote_addr, pub_key, None) - .await; - Ok::<_, anyhow::Error>(()) - }; - - let gw_inbound = async { - let event = - tokio::time::timeout(Duration::from_secs(15), next_stream_event(&mut stream)).await??; - match event { - Event::InboundConnection { conn, .. } => { - assert_eq!(conn.remote_addr(), remote_addr); - Ok(()) - } - other => bail!("Unexpected event: {:?}", other), - } - }; - futures::try_join!(test_controller, gw_inbound)?; - Ok(()) -} - -#[tokio::test] -async fn test_stream_gateway_inbound_conn_rejected() -> anyhow::Result<()> { - let addr: SocketAddr = ([127, 0, 0, 1], 10000).into(); - let (handler, mut test) = config_handler(addr, None, true); - let mut stream = HandshakeEventStream::new(handler); - - let remote_addr = ([127, 0, 0, 1], 10001).into(); - let remote_pub_key = TransportKeypair::new().public().clone(); - let test_controller = async { - test.transport.new_conn(remote_addr).await; - test.transport - .establish_inbound_conn(remote_addr, remote_pub_key.clone(), None) - .await; - - // Reject the connection - let sender_key = TransportKeypair::new().public().clone(); - let acceptor_key = TransportKeypair::new().public().clone(); - let joiner_key = TransportKeypair::new().public().clone(); - let response = NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Response { - id: Transaction::new::(), - sender: PeerKeyLocation { - peer: PeerId::new(addr, sender_key), - location: Some(Location::random()), - }, - target: PeerKeyLocation { - peer: PeerId::new(remote_addr, remote_pub_key), - location: Some(Location::random()), - }, - msg: ConnectResponse::AcceptedBy { - accepted: false, - acceptor: PeerKeyLocation { - peer: PeerId::new(addr, acceptor_key), - location: Some(Location::random()), - }, - joiner: PeerId::new(remote_addr, joiner_key), - }, - })); - - test.transport.inbound_msg(remote_addr, response).await; - Ok::<_, anyhow::Error>(()) - }; - - let gw_inbound = async { - // First event: InboundConnection (may be accepted or rejected depending on routing) - let event = - tokio::time::timeout(Duration::from_secs(15), next_stream_event(&mut stream)).await??; - tracing::info!("Received event: {:?}", event); - Ok(()) - }; - futures::try_join!(test_controller, gw_inbound)?; - Ok(()) -} - -#[tokio::test] -async fn test_stream_peer_to_gw_outbound_conn() -> anyhow::Result<()> { - let addr: SocketAddr = ([127, 0, 0, 1], 10001).into(); - let (handler, mut test) = config_handler(addr, None, false); - let mut stream = HandshakeEventStream::new(handler); - - let joiner_key = TransportKeypair::new(); - let pub_key = joiner_key.public().clone(); - let id = Transaction::new::(); - let remote_addr: SocketAddr = ([127, 0, 0, 2], 10002).into(); - - let test_controller = async { - let open_connection = start_conn(&mut test, remote_addr, pub_key.clone(), id, true).await; - test.transport - .new_outbound_conn(remote_addr, open_connection) - .await; - tracing::debug!("Outbound connection established"); - - // Wait for and respond to StartJoinReq - let msg = test.transport.recv_outbound_msg().await?; - let msg = match msg { - NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Request { - id: inbound_id, - msg: ConnectRequest::StartJoinReq { joiner_key, .. }, - .. - })) => { - assert_eq!(id, inbound_id); - let sender = PeerKeyLocation { - peer: PeerId::new(remote_addr, pub_key.clone()), - location: Some(Location::from_address(&remote_addr)), - }; - let joiner_peer_id = PeerId::new(addr, joiner_key.clone()); - let target = PeerKeyLocation { - peer: joiner_peer_id.clone(), - location: Some(Location::random()), - }; - NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Response { - id: inbound_id, - sender: sender.clone(), - target, - msg: ConnectResponse::AcceptedBy { - accepted: true, - acceptor: sender, - joiner: joiner_peer_id, - }, - })) - } - other => bail!("Unexpected message: {:?}", other), - }; - test.transport.inbound_msg(remote_addr, msg).await; - Ok::<_, anyhow::Error>(()) - }; - - let peer_outbound = async { - let event = - tokio::time::timeout(Duration::from_secs(15), next_stream_event(&mut stream)).await??; - match event { - Event::OutboundGatewayConnectionSuccessful { - peer_id, - connection, - .. - } => { - assert_eq!(peer_id.addr, remote_addr); - assert_eq!(peer_id.pub_key, pub_key); - drop(connection); - Ok(()) - } - other => bail!("Unexpected event: {:?}", other), - } - }; - - futures::try_join!(test_controller, peer_outbound)?; - Ok(()) -} - -#[tokio::test] -async fn test_stream_peer_to_peer_outbound_conn_succeeded() -> anyhow::Result<()> { - let addr: SocketAddr = ([127, 0, 0, 1], 10001).into(); - let (handler, mut test) = config_handler(addr, None, false); - let mut stream = HandshakeEventStream::new(handler); - - let peer_key = TransportKeypair::new(); - let peer_pub_key = peer_key.public().clone(); - let peer_addr = ([127, 0, 0, 2], 10002).into(); - - let tx = Transaction::new::(); - - let test_controller = async { - let open_connection = - start_conn(&mut test, peer_addr, peer_pub_key.clone(), tx, false).await; - test.transport - .new_outbound_conn(peer_addr, open_connection) - .await; - - Ok::<_, anyhow::Error>(()) - }; - - let peer_inbound = async { - let event = - tokio::time::timeout(Duration::from_secs(15), next_stream_event(&mut stream)).await??; - match event { - Event::OutboundConnectionSuccessful { - peer_id, - connection, - } => { - assert_eq!(peer_id.addr, peer_addr); - assert_eq!(peer_id.pub_key, peer_pub_key); - drop(connection); - Ok(()) - } - other => bail!("Unexpected event: {:?}", other), - } - }; - - futures::try_join!(test_controller, peer_inbound)?; - Ok(()) -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_stream_peer_to_gw_outbound_conn_rejected() -> anyhow::Result<()> { - let joiner_addr = ([127, 0, 0, 1], 10001).into(); - let (handler, mut test) = config_handler(joiner_addr, None, false); - let mut stream = HandshakeEventStream::new(handler); - - let gw_key = TransportKeypair::new(); - let gw_pub_key = gw_key.public().clone(); - let gw_addr = ([127, 0, 0, 1], 10000).into(); - let gw_peer_id = PeerId::new(gw_addr, gw_pub_key.clone()); - let gw_pkloc = PeerKeyLocation { - location: Some(Location::from_address(&gw_peer_id.addr)), - peer: gw_peer_id.clone(), - }; - - let joiner_key = TransportKeypair::new(); - let joiner_pub_key = joiner_key.public().clone(); - let joiner_peer_id = PeerId::new(joiner_addr, joiner_pub_key.clone()); - let joiner_pkloc = PeerKeyLocation { - peer: joiner_peer_id.clone(), - location: Some(Location::from_address(&joiner_peer_id.addr)), - }; - - let tx = Transaction::new::(); - - let test_controller = async { - let open_connection = start_conn(&mut test, gw_addr, gw_pub_key.clone(), tx, true).await; - test.transport - .new_outbound_conn(gw_addr, open_connection) - .await; - - let msg = test.transport.recv_outbound_msg().await?; - tracing::info!("Received connect request: {:?}", msg); - let NetMessage::V1(NetMessageV1::Connect(ConnectMsg::Request { - id, - msg: ConnectRequest::StartJoinReq { .. }, - .. - })) = msg - else { - panic!("unexpected message"); - }; - assert_eq!(id, tx); - - let initial_join_req = ConnectMsg::Response { - id: tx, - sender: gw_pkloc.clone(), - target: joiner_pkloc.clone(), - msg: ConnectResponse::AcceptedBy { - accepted: false, - acceptor: gw_pkloc.clone(), - joiner: joiner_peer_id.clone(), - }, - }; - test.transport - .inbound_msg( - gw_addr, - NetMessage::V1(NetMessageV1::Connect(initial_join_req)), - ) - .await; - tracing::debug!("Sent initial gw rejected reply"); - - for i in 1..Ring::DEFAULT_MAX_HOPS_TO_LIVE { - let port = i + 10; - let addr = ([127, 0, port as u8, 1], port as u16).into(); - let acceptor = PeerKeyLocation { - location: Some(Location::from_address(&addr)), - peer: PeerId::new(addr, TransportKeypair::new().public().clone()), - }; - tracing::info!(%acceptor, "Sending forward reply number {i} with status `{}`", i > 3); - let forward_response = ConnectMsg::Response { - id: tx, - sender: gw_pkloc.clone(), - target: joiner_pkloc.clone(), - msg: ConnectResponse::AcceptedBy { - accepted: i > 3, - acceptor: acceptor.clone(), - joiner: joiner_peer_id.clone(), - }, - }; - test.transport - .inbound_msg( - gw_addr, - NetMessage::V1(NetMessageV1::Connect(forward_response.clone())), - ) - .await; - - if i > 3 { - // Create the successful connection - async fn establish_conn( - test: &mut TestVerifier, - i: usize, - joiner_addr: SocketAddr, - ) -> Result<(), anyhow::Error> { - let (remote, ev) = tokio::time::timeout( - Duration::from_secs(10), - test.transport.outbound_recv.recv(), - ) - .await - .inspect_err(|error| { - tracing::error!(%error, conn_num = %i, "failed while receiving connection events"); - }) - .map_err(|_| anyhow!("time out"))? - .ok_or( anyhow!("Failed to receive event"))?; - let ConnectionEvent::ConnectionStart { - open_connection, .. - } = ev; - let out_symm_key = Aes128Gcm::new_from_slice(&[0; 16]).unwrap(); - let in_symm_key = Aes128Gcm::new_from_slice(&[1; 16]).unwrap(); - let (conn, out, inb) = PeerConnection::new_remote_test( - remote, - joiner_addr, - out_symm_key, - in_symm_key.clone(), - ); - test.transport - .packet_senders - .insert(remote, (in_symm_key, out)); - test.transport.packet_receivers.push(inb); - tracing::info!(conn_num = %i, %remote, "Connection established at remote"); - open_connection - .send(Ok(conn)) - .map_err(|_| anyhow!("failed to open conn"))?; - tracing::info!(conn_num = %i, "Returned open conn"); - Ok(()) - } - - establish_conn(&mut test, i, joiner_addr).await?; - } - } - - Ok::<_, anyhow::Error>(()) - }; - - let peer_inbound = async { - let mut conn_count = 0; - let mut gw_rejected = false; - for conn_num in 3..Ring::DEFAULT_MAX_HOPS_TO_LIVE { - let conn_num = conn_num + 2; - let event = - tokio::time::timeout(Duration::from_secs(60), next_stream_event(&mut stream)) - .await - .inspect_err(|_| { - tracing::error!(%conn_num, "failed while waiting for events"); - })? - .inspect_err(|error| { - tracing::error!(%error, %conn_num, "failed while receiving events"); - })?; - match event { - Event::OutboundConnectionSuccessful { peer_id, .. } => { - tracing::info!(%peer_id, %conn_num, "Connection established at peer"); - conn_count += 1; - } - Event::OutboundGatewayConnectionRejected { peer_id } => { - tracing::info!(%peer_id, "Gateway connection rejected"); - assert_eq!(peer_id.addr, gw_addr); - gw_rejected = true; - } - other => bail!("Unexpected event: {:?}", other), - } - } - tracing::debug!("Completed all checks, connection count: {conn_count}"); - assert!(gw_rejected); - assert_eq!(conn_count, 6); - Ok(()) - }; - futures::try_join!(test_controller, peer_inbound)?; - Ok(()) -} diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index 4812db655..f1ff7af4f 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -647,15 +647,6 @@ impl P2pConnManager { ); } } - NodeEvent::SendMessage { target, msg } => { - // Send the message to the target peer over the network - tracing::debug!( - tx = %msg.id(), - %target, - "SendMessage event: sending message to peer via network bridge" - ); - ctx.bridge.send(&target, *msg).await?; - } NodeEvent::QueryConnections { callback } => { let connections = ctx.connections.keys().cloned().collect(); match timeout( diff --git a/crates/core/src/node/op_state_manager.rs b/crates/core/src/node/op_state_manager.rs index 3b597177e..23e931149 100644 --- a/crates/core/src/node/op_state_manager.rs +++ b/crates/core/src/node/op_state_manager.rs @@ -26,8 +26,7 @@ use crate::{ message::{MessageStats, NetMessage, NodeEvent, Transaction, TransactionType}, node::PeerId, operations::{ - connect::ConnectOp, get::GetOp, put::PutOp, subscribe::SubscribeOp, update::UpdateOp, - OpEnum, OpError, + get::GetOp, put::PutOp, subscribe::SubscribeOp, update::UpdateOp, OpEnum, OpError, }, ring::{ConnectionManager, LiveTransactionTracker, Ring}, }; @@ -186,7 +185,6 @@ impl SubOperationTracker { #[derive(Default)] struct Ops { - connect: DashMap, connect_v2: DashMap, put: DashMap, get: DashMap, @@ -431,11 +429,6 @@ impl OpManager { } self.new_transactions.send(id).await?; match op { - OpEnum::Connect(op) => { - #[cfg(debug_assertions)] - check_id_op!(id.transaction_type(), TransactionType::Connect); - self.ops.connect.insert(id, *op); - } OpEnum::ConnectV2(op) => { #[cfg(debug_assertions)] check_id_op!(id.transaction_type(), TransactionType::Connect); @@ -482,14 +475,7 @@ impl OpManager { .connect_v2 .remove(id) .map(|(_k, v)| v) - .map(|op| OpEnum::ConnectV2(Box::new(op))) - .or_else(|| { - self.ops - .connect - .remove(id) - .map(|(_k, v)| v) - .map(|op| OpEnum::Connect(Box::new(op))) - }), + .map(|op| OpEnum::ConnectV2(Box::new(op))), TransactionType::Put => self.ops.put.remove(id).map(|(_k, v)| v).map(OpEnum::Put), TransactionType::Get => self.ops.get.remove(id).map(|(_k, v)| v).map(OpEnum::Get), TransactionType::Subscribe => self @@ -714,7 +700,7 @@ async fn garbage_cleanup_task( continue; } let still_waiting = match tx.transaction_type() { - TransactionType::Connect => ops.connect.remove(&tx).is_none(), + TransactionType::Connect => ops.connect_v2.remove(&tx).is_none(), TransactionType::Put => ops.put.remove(&tx).is_none(), TransactionType::Get => ops.get.remove(&tx).is_none(), TransactionType::Subscribe => ops.subscribe.remove(&tx).is_none(), @@ -763,7 +749,7 @@ async fn garbage_cleanup_task( } } let removed = match tx.transaction_type() { - TransactionType::Connect => ops.connect.remove(&tx).is_some(), + TransactionType::Connect => ops.connect_v2.remove(&tx).is_some(), TransactionType::Put => ops.put.remove(&tx).is_some(), TransactionType::Get => ops.get.remove(&tx).is_some(), TransactionType::Subscribe => ops.subscribe.remove(&tx).is_some(), diff --git a/crates/core/src/node/testing_impl.rs b/crates/core/src/node/testing_impl.rs index 6987d302e..e9a48b995 100644 --- a/crates/core/src/node/testing_impl.rs +++ b/crates/core/src/node/testing_impl.rs @@ -939,11 +939,6 @@ where tracing::debug!(%peer, "ExpectPeerConnection ignored in testing impl"); continue; } - NodeEvent::SendMessage { target, msg } => { - tracing::debug!(tx = %msg.id(), %target, "SendMessage event in testing_impl"); - conn_manager.send(&target, *msg).await?; - continue; - } }, Err(err) => { super::report_result( diff --git a/crates/core/src/operations/connect.rs b/crates/core/src/operations/connect.rs index d635d5a28..ee25c5208 100644 --- a/crates/core/src/operations/connect.rs +++ b/crates/core/src/operations/connect.rs @@ -1,1114 +1,20 @@ -//! Operation which seeks new connections in the ring. use std::borrow::Borrow; use std::collections::HashSet; -use std::pin::Pin; -use std::sync::Arc; +use std::fmt::Display; -use freenet_stdlib::client_api::HostResponse; -use futures::Future; +use serde::{Deserialize, Serialize}; -pub(crate) use self::messages::{ConnectMsg, ConnectRequest, ConnectResponse}; -use super::{connect, OpError, OpInitialization, OpOutcome, Operation, OperationResult}; -use crate::client_events::HostResult; use crate::dev_tool::Location; -use crate::message::{NetMessageV1, NodeEvent}; -use crate::node::IsOperationCompleted; -use crate::ring::ConnectionManager; -use crate::router::Router; +use crate::message::{InnerMessage, Transaction}; +use crate::node::PeerId; +use crate::ring::PeerKeyLocation; use crate::transport::TransportPublicKey; -use crate::{ - message::{InnerMessage, NetMessage, Transaction}, - node::{NetworkBridge, OpManager, PeerId}, - operations::OpEnum, - ring::PeerKeyLocation, - util::Backoff, -}; -#[derive(Debug)] -pub(crate) struct ConnectOp { - id: Transaction, - pub(crate) state: Option, - pub gateway: Option>, - /// keeps track of the number of retries and applies an exponential backoff cooldown period - pub backoff: Option, -} - -impl ConnectOp { - pub fn new( - id: Transaction, - state: Option, - gateway: Option>, - backoff: Option, - ) -> Self { - Self { - id, - state, - gateway, - backoff, - } - } - - #[allow(dead_code)] - pub fn has_backoff(&self) -> bool { - self.backoff.is_some() - } - - pub(super) fn outcome(&self) -> OpOutcome<'_> { - OpOutcome::Irrelevant - } - - pub(super) fn finalized(&self) -> bool { - matches!(self.state, Some(ConnectState::Connected)) - } - - pub(super) fn to_host_result(&self) -> HostResult { - // this shouldn't ever be called since clients can't request explicit connects - Ok(HostResponse::Ok) - } -} - -impl IsOperationCompleted for ConnectOp { - fn is_completed(&self) -> bool { - matches!(self.state, Some(connect::ConnectState::Connected)) - } -} - -/// Not really used since client requests will never interact with this directly. -pub(crate) struct ConnectResult {} - -impl TryFrom for ConnectResult { - type Error = OpError; - - fn try_from(_value: ConnectOp) -> Result { - Ok(Self {}) - } -} - -impl Operation for ConnectOp { - type Message = ConnectMsg; - type Result = ConnectResult; - - async fn load_or_init<'a>( - op_manager: &'a OpManager, - msg: &'a Self::Message, - ) -> Result, OpError> { - let sender; - let tx = *msg.id(); - match op_manager.pop(msg.id()) { - Ok(Some(OpEnum::Connect(connect_op))) => { - sender = msg.sender().cloned(); - // was an existing operation, the other peer messaged back - Ok(OpInitialization { - op: *connect_op, - sender, - }) - } - Ok(Some(op)) => { - let _ = op_manager.push(tx, op).await; - Err(OpError::OpNotPresent(tx)) - } - Ok(None) => { - let gateway = if !matches!( - msg, - ConnectMsg::Request { - msg: ConnectRequest::FindOptimalPeer { .. }, - .. - } - ) { - Some(Box::new(op_manager.ring.connection_manager.own_location())) - } else { - None - }; - // new request to join this node, initialize the state - Ok(OpInitialization { - op: Self { - id: tx, - state: Some(ConnectState::Initializing), - backoff: None, - gateway, - }, - sender: None, - }) - } - Err(err) => { - #[cfg(debug_assertions)] - if matches!(err, crate::node::OpNotAvailable::Completed) { - let target = msg.target(); - let target = target.as_ref().map(|b| b.borrow()); - tracing::warn!(%tx, peer = ?target, "filtered"); - } - Err(err.into()) - } - } - } - - fn id(&self) -> &Transaction { - &self.id - } - - fn process_message<'a, NB: NetworkBridge>( - mut self, - network_bridge: &'a mut NB, - op_manager: &'a OpManager, - input: &'a Self::Message, - ) -> Pin> + Send + 'a>> { - Box::pin(async move { - let return_msg; - let new_state; - - match input { - ConnectMsg::Request { - msg: - ConnectRequest::FindOptimalPeer { - query_target, - ideal_location, - joiner, - max_hops_to_live, - skip_connections, - skip_forwards, - }, - id, - .. - } => { - let ring_max_htl = op_manager.ring.max_hops_to_live.max(1); - let mut max_hops = (*max_hops_to_live).min(ring_max_htl); - if max_hops == 0 { - max_hops = 1; - } - let own_loc = op_manager.ring.connection_manager.own_location(); - let PeerKeyLocation { - peer: this_peer, - location: Some(_), - } = &own_loc - else { - return Err(OpError::RingError(crate::ring::RingError::NoLocation)); - }; - let mut skip_connections = skip_connections.clone(); - let mut skip_forwards = skip_forwards.clone(); - skip_connections.extend([ - this_peer.clone(), - query_target.peer.clone(), - joiner.peer.clone(), - ]); - skip_forwards.extend([this_peer.clone(), query_target.peer.clone()]); - if this_peer == &query_target.peer { - // this peer should be the original target queries - tracing::info!( - tx = %id, - query_target = %query_target.peer, - joiner = %joiner.peer, - skip_connections_count = skip_connections.len(), - "Gateway received FindOptimalPeer request from joiner", - ); - // Use the full skip_connections set to avoid recommending peers - // that the joiner is already connected to (including the gateway itself) - if let Some(desirable_peer) = op_manager.ring.closest_to_location( - *ideal_location, - skip_connections.iter().cloned().collect(), - ) { - tracing::info!( - tx = %id, - query_target = %query_target.peer, - joiner = %joiner.peer, - desirable_peer = %desirable_peer.peer, - "Gateway found desirable peer, forwarding to joiner", - ); - let msg = create_forward_message( - *id, - &own_loc, - joiner, - &desirable_peer, - max_hops, - max_hops, - skip_connections, - skip_forwards, - ); - network_bridge.send(&desirable_peer.peer, msg).await?; - return_msg = None; - new_state = Some(ConnectState::AwaitingConnectionAcquisition {}); - } else { - tracing::warn!( - tx = %id, - query_target = %query_target.peer, - joiner = %joiner.peer, - "Gateway found no suitable peers to forward CheckConnectivity request", - ); - // Send a negative response back to the joiner to inform them - // that no suitable peers are currently available - let response = ConnectResponse::AcceptedBy { - accepted: false, - acceptor: own_loc.clone(), - joiner: joiner.peer.clone(), - }; - return_msg = Some(ConnectMsg::Response { - id: *id, - sender: own_loc.clone(), - target: joiner.clone(), - msg: response, - }); - new_state = None; - } - } else { - // this peer is the one establishing connections - tracing::debug!( - tx = %id, - query_target = %query_target.peer, - this_peer = %joiner.peer, - "Querying the query target for new connections", - ); - debug_assert_eq!(this_peer, &joiner.peer); - new_state = Some(ConnectState::AwaitingNewConnection(NewConnectionInfo { - remaining_connections: max_hops, - })); - let msg = ConnectMsg::Request { - id: *id, - target: query_target.clone(), - msg: ConnectRequest::FindOptimalPeer { - query_target: query_target.clone(), - ideal_location: *ideal_location, - joiner: joiner.clone(), - max_hops_to_live: max_hops, - skip_connections, - skip_forwards, - }, - }; - network_bridge.send(&query_target.peer, msg.into()).await?; - return_msg = None; - } - } - ConnectMsg::Request { - id, - msg: - ConnectRequest::CheckConnectivity { - sender, - joiner, - hops_to_live, - max_hops_to_live, - skip_connections, - skip_forwards, - .. - }, - .. - } => { - let this_peer = op_manager.ring.connection_manager.own_location(); - let ring_max_htl = op_manager.ring.max_hops_to_live.max(1); - let mut max_htl = (*max_hops_to_live).min(ring_max_htl); - if max_htl == 0 { - max_htl = 1; - } - let mut hops_left = (*hops_to_live).min(max_htl); - if hops_left == 0 { - tracing::warn!( - tx = %id, - sender = %sender.peer, - joiner = %joiner.peer, - "Received CheckConnectivity with zero hops to live; clamping to 1" - ); - hops_left = 1; - } - if sender.peer == joiner.peer { - tracing::error!( - tx = %id, - sender = %sender.peer, - joiner = %joiner.peer, - at = %this_peer.peer, - "Connectivity check from self (sender == joiner), rejecting operation" - ); - return Err(OpError::UnexpectedOpState); - } - if this_peer.peer == joiner.peer { - tracing::error!( - tx = %id, - this_peer = %this_peer.peer, - joiner = %joiner.peer, - sender = %sender.peer, - "Received CheckConnectivity where this peer is the joiner (self-connection attempt), rejecting operation" - ); - return Err(OpError::UnexpectedOpState); - } - let joiner_loc = joiner - .location - .expect("should be already set at the p2p bridge level"); - - tracing::debug!( - tx = %id, - at = %this_peer.peer, - hops_to_live = %hops_left, - joiner = %joiner, - "Checking connectivity request received" - ); - - let requested_accept = op_manager - .ring - .connection_manager - .should_accept(joiner_loc, &joiner.peer); - let acceptance_status = if requested_accept { - tracing::info!(tx = %id, %joiner, "CheckConnectivity: Accepting connection from, will trigger ConnectPeer"); - // Ensure the transport layer is ready for the incoming handshake before we notify upstream. - op_manager - .notify_node_event(NodeEvent::ExpectPeerConnection { - peer: joiner.peer.clone(), - }) - .await?; - if sender.peer != this_peer.peer { - let accept_msg = ConnectMsg::Response { - id: *id, - sender: this_peer.clone(), - target: sender.clone(), - msg: ConnectResponse::AcceptedBy { - accepted: true, - acceptor: this_peer.clone(), - joiner: joiner.peer.clone(), - }, - }; - op_manager - .notify_node_event(NodeEvent::SendMessage { - target: sender.peer.clone(), - msg: Box::new(NetMessage::from(accept_msg)), - }) - .await?; - } - let (callback, mut result) = tokio::sync::mpsc::channel(10); - // Attempt to connect to the joiner - op_manager - .notify_node_event(NodeEvent::ConnectPeer { - peer: joiner.peer.clone(), - tx: *id, - callback, - is_gw: false, - }) - .await?; - let mut status = true; - match result.recv().await.ok_or(OpError::NotificationError)? { - Ok((peer_id, remaining_checks)) => { - tracing::info!( - tx = %id, - joiner = %joiner.peer, - connected_peer = %peer_id, - remaining_checks, - "ConnectPeer completed successfully" - ); - let was_reserved = true; // reserved just above in call to should_accept - op_manager - .ring - .add_connection(joiner_loc, joiner.peer.clone(), was_reserved) - .await; - } - Err(()) => { - tracing::info!( - tx = %id, - joiner = %joiner.peer, - "ConnectPeer failed to establish connection" - ); - op_manager - .ring - .connection_manager - .prune_in_transit_connection(&joiner.peer); - status = false; - if sender.peer != this_peer.peer { - let decline_msg = ConnectMsg::Response { - id: *id, - sender: this_peer.clone(), - target: sender.clone(), - msg: ConnectResponse::AcceptedBy { - accepted: false, - acceptor: this_peer.clone(), - joiner: joiner.peer.clone(), - }, - }; - op_manager - .notify_node_event(NodeEvent::SendMessage { - target: sender.peer.clone(), - msg: Box::new(NetMessage::from(decline_msg)), - }) - .await?; - } - } - } - status - } else { - tracing::debug!(tx = %id, at = %this_peer.peer, from = %joiner, "Rejecting connection"); - if sender.peer != this_peer.peer { - let decline_msg = ConnectMsg::Response { - id: *id, - sender: this_peer.clone(), - target: sender.clone(), - msg: ConnectResponse::AcceptedBy { - accepted: false, - acceptor: this_peer.clone(), - joiner: joiner.peer.clone(), - }, - }; - op_manager - .notify_node_event(NodeEvent::SendMessage { - target: sender.peer.clone(), - msg: Box::new(NetMessage::from(decline_msg)), - }) - .await?; - } - false - }; - - { - let mut new_skip_list = skip_connections.clone(); - new_skip_list.insert(this_peer.peer.clone()); - if let Some(updated_state) = forward_conn( - *id, - &op_manager.ring.connection_manager, - op_manager.ring.router.clone(), - network_bridge, - ForwardParams { - left_htl: hops_left, - max_htl, - accepted: requested_accept, - skip_connections: skip_connections.clone(), - skip_forwards: skip_forwards.clone(), - req_peer: sender.clone(), - joiner: joiner.clone(), - is_gateway: op_manager.ring.is_gateway, - }, - ) - .await? - { - new_state = Some(updated_state); - } else { - new_state = None - } - } - - let response_msg = ConnectMsg::Response { - id: *id, - sender: this_peer.clone(), - target: sender.clone(), - msg: ConnectResponse::AcceptedBy { - accepted: acceptance_status, - acceptor: this_peer.clone(), - joiner: joiner.peer.clone(), - }, - }; - return_msg = Some(response_msg); - } - ConnectMsg::Response { - id, - sender, - target, - msg: - ConnectResponse::AcceptedBy { - accepted, - acceptor, - joiner, - }, - } => { - tracing::debug!( - tx = %id, - at = %target.peer, - from = %sender.peer, - "Connect response received", - ); - - let this_peer_id = op_manager - .ring - .connection_manager - .get_peer_key() - .expect("peer id not found"); - - match self.state.as_mut() { - Some(ConnectState::ConnectingToNode(info)) => { - assert!(info.remaining_connections > 0); - let remaining_connections = - info.remaining_connections.saturating_sub(1); - - if *accepted { - tracing::debug!( - tx = %id, - at = %this_peer_id, - from = %sender.peer, - connected_to = %acceptor.peer, - "Open connection acknowledged at requesting joiner peer", - ); - if acceptor.peer != this_peer_id { - // Ensure inbound handshake packets from the acceptor aren't dropped. - op_manager - .notify_node_event(NodeEvent::ExpectPeerConnection { - peer: acceptor.peer.clone(), - }) - .await?; - } - tracing::info!( - tx = %id, - joiner = %this_peer_id, - acceptor = %acceptor.peer, - location = ?acceptor.location, - "Connect response accepted; registering connection" - ); - info.accepted_by.insert(acceptor.clone()); - op_manager - .ring - .add_connection( - acceptor.location.expect("location not found"), - acceptor.peer.clone(), - true, - ) - .await; - } else { - tracing::debug!( - tx = %id, - at = %this_peer_id, - from = %sender.peer, - rejected_peer = %acceptor.peer, - "Connection rejected", - ); - tracing::info!( - tx = %id, - joiner = %this_peer_id, - rejector = %acceptor.peer, - "Connect response rejected by peer" - ); - } - - let your_location: Location = - target.location.expect("location not found"); - tracing::debug!( - tx = %id, - at = %this_peer_id, - location = %your_location, - "Updating assigned location" - ); - op_manager - .ring - .connection_manager - .update_location(target.location); - tracing::info!( - tx = %id, - at = %this_peer_id, - new_location = ?target.location, - "Updated joiner location from connect response" - ); - - if remaining_connections == 0 { - tracing::debug!( - tx = %id, - at = %this_peer_id, - from = %sender.peer, - "All available connections established", - ); - - try_clean_gw_connection(*id, network_bridge, info, target.clone()) - .await?; - - new_state = Some(ConnectState::Connected); - } else { - new_state = Some(ConnectState::ConnectingToNode(info.clone())); - } - return_msg = None; - } - Some(ConnectState::AwaitingConnectivity(ConnectivityInfo { - remaining_checks, - requester, - .. - })) => { - assert!(*remaining_checks > 0); - let remaining_checks = remaining_checks.saturating_sub(1); - - tracing::debug!( - tx = %id, - at = %this_peer_id, - from = %sender.peer, - acceptor = %acceptor.peer, - accepted = %accepted, - "Connectivity check", - ); - - if remaining_checks == 0 { - tracing::debug!( - tx = %id, - at = %this_peer_id, - from = %sender.peer, - "All connectivity checks done", - ); - new_state = None; - } else { - new_state = Some(ConnectState::AwaitingConnectivity( - ConnectivityInfo::new(requester.clone(), remaining_checks), - )); - } - let response = ConnectResponse::AcceptedBy { - accepted: *accepted, - acceptor: acceptor.clone(), - joiner: joiner.clone(), - }; - return_msg = Some(ConnectMsg::Response { - id: *id, - sender: target.clone(), - msg: response, - target: requester.clone(), - }); - } - Some(ConnectState::AwaitingNewConnection(info)) => { - tracing::debug!( - tx = %id, - at = %this_peer_id, - from = %sender.peer, - "Connection request forwarded", - ); - assert!(info.remaining_connections > 0); - let remaining_connections = - info.remaining_connections.saturating_sub(1); - - if *accepted && *joiner == this_peer_id && acceptor.peer != this_peer_id - { - tracing::debug!( - tx = %id, - at = %this_peer_id, - acceptor = %acceptor.peer, - "Forward path accepted connection; registering inbound expectation" - ); - op_manager - .notify_node_event(NodeEvent::ExpectPeerConnection { - peer: acceptor.peer.clone(), - }) - .await?; - } - - if remaining_connections == 0 { - tracing::debug!( - tx = %id, - at = %this_peer_id, - from = %sender.peer, - "All available connections established", - ); - op_manager - .ring - .live_tx_tracker - .missing_candidate_peers(sender.peer.clone()) - .await; - new_state = None; - } else { - new_state = - Some(ConnectState::AwaitingNewConnection(NewConnectionInfo { - remaining_connections, - })); - } - - return_msg = None; - } - _ => { - tracing::debug!( - tx = %id, - peer = %this_peer_id, - "Failed to establish any connections, aborting" - ); - let op = ConnectOp { - id: *id, - state: None, - gateway: self.gateway, - backoff: self.backoff, - }; - op_manager - .notify_op_change( - NetMessage::V1(NetMessageV1::Aborted(*id)), - OpEnum::Connect(op.into()), - ) - .await?; - return Err(OpError::StatePushed); - } - } - } - _ => return Err(OpError::UnexpectedOpState), - } - - build_op_result(self.id, new_state, return_msg, self.gateway, self.backoff) - }) - } -} - -fn build_op_result( - id: Transaction, - state: Option, - msg: Option, - gateway: Option>, - backoff: Option, -) -> Result { - tracing::debug!(tx = %id, ?msg, "Connect operation result"); - Ok(OperationResult { - return_msg: msg.map(NetMessage::from), - state: state.map(|state| { - OpEnum::Connect(Box::new(ConnectOp { - id, - state: Some(state), - gateway, - backoff, - })) - }), - }) -} - -async fn try_clean_gw_connection( - id: Transaction, - conn_bridge: &mut NB, - state: &mut ConnectionInfo, - joiner: PeerKeyLocation, -) -> Result<(), OpError> -where - NB: NetworkBridge, -{ - let need_to_clean_gw_conn = state - .accepted_by - .iter() - .all(|pkloc| pkloc.peer != state.gateway.peer); - - if need_to_clean_gw_conn { - let msg = ConnectMsg::Request { - id, - target: state.gateway.clone(), - msg: ConnectRequest::CleanConnection { joiner }, - }; - conn_bridge.send(&state.gateway.peer, msg.into()).await?; - } - Ok(()) -} - -type Requester = PeerKeyLocation; - -#[derive(Debug)] -pub enum ConnectState { - Initializing, - #[allow(dead_code)] - ConnectingToNode(ConnectionInfo), - AwaitingConnectivity(ConnectivityInfo), - AwaitingConnectionAcquisition, - AwaitingNewConnection(NewConnectionInfo), - Connected, -} - -#[derive(Debug, Clone)] -pub(crate) struct ConnectivityInfo { - remaining_checks: usize, - requester: Requester, - /// Indicates this is a gateway bootstrap acceptance that should be registered immediately. - /// See forward_conn() bootstrap logic and handshake handler for details. - pub(crate) is_bootstrap_acceptance: bool, -} - -impl ConnectivityInfo { - pub fn new(requester: Requester, remaining_checks: usize) -> Self { - Self { - requester, - remaining_checks, - is_bootstrap_acceptance: false, - } - } - - pub fn new_bootstrap(requester: Requester, remaining_checks: usize) -> Self { - Self { - requester, - remaining_checks, - is_bootstrap_acceptance: true, - } - } - - /// Decrements the remaining checks and returns whether the checks are complete. - pub fn decrement_check(&mut self) -> bool { - self.remaining_checks = self.remaining_checks.saturating_sub(1); - self.remaining_checks == 0 - } -} - -#[derive(Debug, Clone)] -pub(crate) struct ConnectionInfo { - gateway: PeerKeyLocation, - accepted_by: HashSet, - remaining_connections: usize, -} - -#[derive(Debug, Clone)] -pub(crate) struct NewConnectionInfo { - remaining_connections: usize, -} - -impl ConnectState { - #[allow(dead_code)] - fn try_unwrap_connecting(self) -> Result { - if let Self::ConnectingToNode(conn_info) = self { - Ok(conn_info) - } else { - Err(OpError::UnexpectedOpState) - } - } -} - -/// # Arguments -/// -/// - gateways: Inmutable list of known gateways. Passed when starting up the node. -/// After the initial connections through the gateways are established all other connections -/// (to gateways or regular peers) will be treated as regular connections. -pub(crate) struct ForwardParams { - pub left_htl: usize, - pub max_htl: usize, - pub accepted: bool, - /// Avoid connecting to these peers. - pub skip_connections: HashSet, - /// Avoid forwarding to these peers. - pub skip_forwards: HashSet, - pub req_peer: PeerKeyLocation, - pub joiner: PeerKeyLocation, - /// Whether this node is a gateway - pub is_gateway: bool, -} - -pub(crate) async fn forward_conn( - id: Transaction, - connection_manager: &ConnectionManager, - router: Arc>, - network_bridge: &mut NB, - params: ForwardParams, -) -> Result, OpError> -where - NB: NetworkBridge, -{ - let ForwardParams { - left_htl, - max_htl, - accepted, - mut skip_connections, - mut skip_forwards, - req_peer, - joiner, - is_gateway, - } = params; - if left_htl == 0 { - tracing::debug!( - tx = %id, - joiner = %joiner.peer, - "Couldn't forward connect petition, no hops left", - ); - return Ok(None); - } - - let num_connections = connection_manager.num_connections(); - let num_reserved = connection_manager.get_reserved_connections(); - tracing::info!( - tx = %id, - joiner = %joiner.peer, - num_connections = %num_connections, - num_reserved = %num_reserved, - is_gateway = %is_gateway, - accepted = %accepted, - skip_connections_count = %skip_connections.len(), - skip_forwards_count = %skip_forwards.len(), - "forward_conn: checking connection forwarding", - ); - - // Bootstrap: gateway has no neighbours yet, so we keep the courtesy link and stop here. - if is_gateway && accepted && num_connections == 0 { - if num_reserved != 1 { - tracing::debug!( - tx = %id, - joiner = %joiner.peer, - num_reserved, - "Gateway bootstrap registration proceeding despite reserved count" - ); - } - tracing::info!( - tx = %id, - joiner = %joiner.peer, - "Gateway bootstrap: accepting first neighbour directly" - ); - let connectivity_info = ConnectivityInfo::new_bootstrap(joiner.clone(), 1); - return Ok(Some(ConnectState::AwaitingConnectivity(connectivity_info))); - } - - if num_connections == 0 { - tracing::debug!( - tx = %id, - joiner = %joiner.peer, - is_gateway = %is_gateway, - num_reserved = %num_reserved, - "Cannot forward or accept: no existing connections, or reserved connections pending", - ); - return Ok(None); - } - - // Try to forward the connection request to an existing peer - if num_connections > 0 { - let target_peer = { - let router = router.read(); - select_forward_target( - id, - connection_manager, - &router, - &req_peer, - &joiner, - left_htl, - &skip_forwards, - ) - }; - - skip_connections.insert(req_peer.peer.clone()); - skip_forwards.insert(req_peer.peer.clone()); - - match target_peer { - Some(target_peer) => { - tracing::info!( - tx = %id, - joiner = %joiner.peer, - next_hop = %target_peer.peer, - htl = left_htl, - "forward_conn: forwarding connection request to peer candidate" - ); - // Successfully found a peer to forward to - let forward_msg = create_forward_message( - id, - &req_peer, - &joiner, - &target_peer, - left_htl, - max_htl, - skip_connections, - skip_forwards, - ); - tracing::debug!( - target: "network", - tx = %id, - "Forwarding connection request to {:?}", - target_peer - ); - network_bridge.send(&target_peer.peer, forward_msg).await?; - let forwarded_state = update_state_with_forward_info(&req_peer, left_htl)?; - return Ok(forwarded_state); - } - None => { - // Couldn't find suitable peer to forward to - tracing::info!( - tx = %id, - joiner = %joiner.peer, - skip_count = skip_forwards.len(), - connections = num_connections, - accepted_flag = %accepted, - "forward_conn: no suitable peer found for forwarding despite available connections" - ); - return Ok(None); - } - } - } - - // Should be unreachable - we either forwarded or returned None - unreachable!("forward_conn should have returned by now") -} - -fn select_forward_target( - id: Transaction, - connection_manager: &ConnectionManager, - router: &Router, - request_peer: &PeerKeyLocation, - joiner: &PeerKeyLocation, - left_htl: usize, - skip_forwards: &HashSet, -) -> Option { - // Create an extended skip list that includes the joiner to prevent forwarding to the joiner - let mut extended_skip = skip_forwards.clone(); - extended_skip.insert(joiner.peer.clone()); - if let Some(self_peer) = connection_manager.get_peer_key() { - extended_skip.insert(self_peer); - } - - if left_htl >= connection_manager.rnd_if_htl_above { - tracing::debug!( - tx = %id, - joiner = %joiner.peer, - "Randomly selecting peer to forward connect request", - ); - let candidate = connection_manager.random_peer(|p| !extended_skip.contains(p)); - if candidate.is_none() { - tracing::info!( - tx = %id, - joiner = %joiner.peer, - skip = ?extended_skip, - "select_forward_target: random selection found no candidate" - ); - } else if let Some(ref c) = candidate { - tracing::info!( - tx = %id, - joiner = %joiner.peer, - next_hop = %c.peer, - "select_forward_target: random candidate selected" - ); - } - candidate - } else { - tracing::debug!( - tx = %id, - joiner = %joiner.peer, - "Selecting close peer to forward request", - ); - let candidate = connection_manager - .routing( - joiner.location.unwrap(), - Some(&request_peer.peer), - &extended_skip, - router, - ) - .and_then(|pkl| (pkl.peer != joiner.peer).then_some(pkl)); - if candidate.is_none() { - tracing::info!( - tx = %id, - joiner = %joiner.peer, - skip = ?extended_skip, - "select_forward_target: router returned no candidate" - ); - } else if let Some(ref c) = candidate { - tracing::info!( - tx = %id, - joiner = %joiner.peer, - next_hop = %c.peer, - "select_forward_target: routing candidate selected" - ); - } - candidate - } -} - -#[allow(clippy::too_many_arguments)] -fn create_forward_message( - id: Transaction, - request_peer: &PeerKeyLocation, - joiner: &PeerKeyLocation, - target: &PeerKeyLocation, - hops_to_live: usize, - max_hops_to_live: usize, - skip_connections: HashSet, - skip_forwards: HashSet, -) -> NetMessage { - NetMessage::from(ConnectMsg::Request { - id, - target: target.clone(), - msg: ConnectRequest::CheckConnectivity { - sender: request_peer.clone(), - joiner: joiner.clone(), - hops_to_live: hops_to_live.saturating_sub(1), // decrement the hops to live for the next hop - max_hops_to_live, - skip_connections, - skip_forwards, - }, - }) -} - -fn update_state_with_forward_info( - requester: &PeerKeyLocation, - left_htl: usize, -) -> Result, OpError> { - let connecivity_info = ConnectivityInfo::new(requester.clone(), left_htl); - let new_state = ConnectState::AwaitingConnectivity(connecivity_info); - Ok(Some(new_state)) -} +pub(crate) use self::messages::{ConnectMsg, ConnectResponse}; mod messages { - use std::fmt::Display; - use super::*; - use serde::{Deserialize, Serialize}; - #[derive(Debug, Serialize, Deserialize, Clone)] pub(crate) enum ConnectMsg { Request { @@ -1138,6 +44,7 @@ mod messages { } } + #[allow(refining_impl_trait)] fn target(&self) -> Option> { use ConnectMsg::*; match self { @@ -1152,17 +59,6 @@ mod messages { } } - impl ConnectMsg { - pub fn sender(&self) -> Option<&PeerId> { - use ConnectMsg::*; - match self { - Response { sender, .. } => Some(&sender.peer), - Connected { sender, .. } => Some(&sender.peer), - Request { .. } => None, - } - } - } - impl Display for ConnectMsg { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let id = self.id(); @@ -1174,11 +70,12 @@ mod messages { } => write!(f, "StartRequest(id: {id}, target: {target})"), Self::Request { target, - msg: ConnectRequest::CheckConnectivity { - sender, - joiner, - .. - }, + msg: + ConnectRequest::CheckConnectivity { + sender, + joiner, + .. + }, .. } => write!( f, @@ -1196,7 +93,9 @@ mod messages { "AcceptedBy(id: {id}, target: {target}, accepted: {accepted}, acceptor: {acceptor})" ), Self::Connected { .. } => write!(f, "Connected(id: {id})"), - ConnectMsg::Request { id, target, .. } => write!(f, "Request(id: {id}, target: {target})"), + ConnectMsg::Request { id, target, .. } => { + write!(f, "Request(id: {id}, target: {target})") + } } } } @@ -1205,24 +104,21 @@ mod messages { pub(crate) enum ConnectRequest { /// A request to join a gateway. StartJoinReq { - // The peer who is trying to join, should be set when PeerConnection is established + /// The peer attempting to join (set when the PeerConnection is established). joiner: Option, joiner_key: TransportPublicKey, - /// Used for deterministic testing purposes. In production, this should be none and will be ignored - /// by the gateway. + /// Used for deterministic testing purposes. Ignored in production. joiner_location: Option, hops_to_live: usize, max_hops_to_live: usize, - // Peers we don't want to connect to directly + /// Peers we don't want to connect to directly. skip_connections: HashSet, - // Peers we don't want to forward connectivity messages to (to avoid loops) + /// Peers we don't want to forward connectivity messages to (avoid loops). skip_forwards: HashSet, }, /// Query target should find a good candidate for joiner to join. FindOptimalPeer { - /// Peer whom you are querying new connection about. query_target: PeerKeyLocation, - /// The ideal location of the peer to which you would connect. ideal_location: Location, joiner: PeerKeyLocation, max_hops_to_live: usize, diff --git a/crates/core/src/operations/connect_v2.rs b/crates/core/src/operations/connect_v2.rs index 75f39c87b..b2c1fd7c4 100644 --- a/crates/core/src/operations/connect_v2.rs +++ b/crates/core/src/operations/connect_v2.rs @@ -764,7 +764,7 @@ fn store_operation_state_with_msg( state: Some(state), gateway: op.gateway.clone(), backoff: op.backoff.clone(), - desired_location: op.desired_location.clone(), + desired_location: op.desired_location, })) }), } diff --git a/crates/core/src/operations/mod.rs b/crates/core/src/operations/mod.rs index b2bf7e70f..70f483af8 100644 --- a/crates/core/src/operations/mod.rs +++ b/crates/core/src/operations/mod.rs @@ -201,7 +201,6 @@ where } pub(crate) enum OpEnum { - Connect(Box), ConnectV2(Box), Put(put::PutOp), Get(get::GetOp), @@ -212,7 +211,6 @@ pub(crate) enum OpEnum { impl OpEnum { delegate::delegate! { to match self { - OpEnum::Connect(op) => op, OpEnum::ConnectV2(op) => op, OpEnum::Put(op) => op, OpEnum::Get(op) => op, diff --git a/crates/core/src/ring/connection.rs b/crates/core/src/ring/connection.rs index 7b017b7d8..2629886d0 100644 --- a/crates/core/src/ring/connection.rs +++ b/crates/core/src/ring/connection.rs @@ -6,10 +6,3 @@ pub struct Connection { pub(crate) location: PeerKeyLocation, pub(crate) open_at: Instant, } - -#[cfg(test)] -impl Connection { - pub fn get_location(&self) -> &PeerKeyLocation { - &self.location - } -} diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index 242e12972..b17939a73 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -1,6 +1,6 @@ use parking_lot::Mutex; use rand::prelude::IndexedRandom; -use std::collections::btree_map::Entry; +use std::collections::{btree_map::Entry, BTreeMap}; use crate::topology::{Limits, TopologyManager}; @@ -24,33 +24,6 @@ pub(crate) struct ConnectionManager { pub pub_key: Arc, } -#[cfg(test)] -impl ConnectionManager { - pub fn default_with_key(pub_key: TransportPublicKey) -> Self { - let min_connections = Ring::DEFAULT_MIN_CONNECTIONS; - let max_connections = Ring::DEFAULT_MAX_CONNECTIONS; - let max_upstream_bandwidth = Ring::DEFAULT_MAX_UPSTREAM_BANDWIDTH; - let max_downstream_bandwidth = Ring::DEFAULT_MAX_DOWNSTREAM_BANDWIDTH; - let rnd_if_htl_above = Ring::DEFAULT_RAND_WALK_ABOVE_HTL; - - Self::init( - max_upstream_bandwidth, - max_downstream_bandwidth, - min_connections, - max_connections, - rnd_if_htl_above, - ( - pub_key, - None, - AtomicU64::new(u64::from_le_bytes( - Location::random().as_f64().to_le_bytes(), - )), - ), - false, - ) - } -} - impl ConnectionManager { pub fn new(config: &NodeConfig) -> Self { let min_connections = if let Some(v) = config.min_number_conn { @@ -162,6 +135,7 @@ impl ConnectionManager { is_gateway = self.is_gateway, min = self.min_connections, max = self.max_connections, + rnd_if_htl_above = self.rnd_if_htl_above, "should_accept: evaluating direct acceptance guard" ); @@ -451,11 +425,6 @@ impl ConnectionManager { .load(std::sync::atomic::Ordering::SeqCst) } - pub(crate) fn get_reserved_connections(&self) -> usize { - self.reserved_connections - .load(std::sync::atomic::Ordering::SeqCst) - } - pub(super) fn get_connections_by_location(&self) -> BTreeMap> { self.connections_by_location.read().clone() } @@ -464,36 +433,6 @@ impl ConnectionManager { self.location_for_peer.read().clone() } - /// Get a random peer from the known ring connections. - pub fn random_peer(&self, filter_fn: F) -> Option - where - F: Fn(&PeerId) -> bool, - { - let peers = &*self.location_for_peer.read(); - let amount = peers.len(); - if amount == 0 { - return None; - } - let mut rng = rand::rng(); - let mut attempts = 0; - loop { - if attempts >= amount * 2 { - return None; - } - let selected = rng.random_range(0..amount); - let (peer, loc) = peers.iter().nth(selected).expect("infallible"); - if !filter_fn(peer) { - attempts += 1; - continue; - } else { - return Some(PeerKeyLocation { - peer: peer.clone(), - location: Some(*loc), - }); - } - } - } - /// Route an op to the most optimal target. pub fn routing( &self, diff --git a/crates/core/src/ring/live_tx.rs b/crates/core/src/ring/live_tx.rs index cc1fd25f8..2a0988a1e 100644 --- a/crates/core/src/ring/live_tx.rs +++ b/crates/core/src/ring/live_tx.rs @@ -1,27 +1,13 @@ use crate::{message::Transaction, node::PeerId}; use dashmap::DashMap; use std::sync::Arc; -use tokio::sync; #[derive(Clone)] pub struct LiveTransactionTracker { tx_per_peer: Arc>>, - missing_candidate_sender: sync::mpsc::Sender, } impl LiveTransactionTracker { - /// The given peer does not have (good) candidates for acquiring new connections. - pub async fn missing_candidate_peers(&self, peer: PeerId) { - let _ = self - .missing_candidate_sender - .send(peer) - .await - .map_err(|error| { - tracing::debug!(%error, "live transaction tracker channel closed"); - error - }); - } - pub fn add_transaction(&self, peer: PeerId, tx: Transaction) { self.tx_per_peer.entry(peer).or_default().push(tx); } @@ -42,15 +28,10 @@ impl LiveTransactionTracker { } } - pub(crate) fn new() -> (Self, sync::mpsc::Receiver) { - let (missing_peer, rx) = sync::mpsc::channel(10); - ( - Self { - tx_per_peer: Arc::new(DashMap::default()), - missing_candidate_sender: missing_peer, - }, - rx, - ) + pub(crate) fn new() -> Self { + Self { + tx_per_peer: Arc::new(DashMap::default()), + } } pub(crate) fn prune_transactions_from_peer(&self, peer: &PeerId) { diff --git a/crates/core/src/ring/mod.rs b/crates/core/src/ring/mod.rs index 22f41f64a..3ac8c59a7 100644 --- a/crates/core/src/ring/mod.rs +++ b/crates/core/src/ring/mod.rs @@ -6,23 +6,18 @@ use std::collections::{BTreeSet, HashSet}; use std::net::SocketAddr; use std::{ - cmp::Reverse, - collections::BTreeMap, sync::{ atomic::{AtomicU64, AtomicUsize}, Arc, Weak, }, time::{Duration, Instant}, }; -use tokio::sync::mpsc::{self, error::TryRecvError}; use tracing::Instrument; use dashmap::mapref::one::Ref as DmRef; use either::Either; use freenet_stdlib::prelude::ContractKey; -use itertools::Itertools; use parking_lot::RwLock; -use rand::Rng; use crate::message::TransactionType; use crate::topology::rate::Rate; @@ -104,7 +99,7 @@ impl Ring { is_gateway: bool, connection_manager: ConnectionManager, ) -> anyhow::Result> { - let (live_tx_tracker, missing_candidate_rx) = LiveTransactionTracker::new(); + let live_tx_tracker = LiveTransactionTracker::new(); let max_hops_to_live = if let Some(v) = config.max_hops_to_live { v @@ -144,7 +139,7 @@ impl Ring { GlobalExecutor::spawn( ring.clone() - .connection_maintenance(event_loop_notifier, live_tx_tracker, missing_candidate_rx) + .connection_maintenance(event_loop_notifier, live_tx_tracker) .instrument(span), ); Ok(ring) @@ -384,47 +379,10 @@ impl Ring { .await; } - pub fn closest_to_location( - &self, - location: Location, - skip_list: HashSet, - ) -> Option { - let connections = self.connection_manager.get_connections_by_location(); - if tracing::enabled!(tracing::Level::DEBUG) { - let total_peers: usize = connections.values().map(|v| v.len()).sum(); - tracing::debug!( - unique_locations = connections.len(), - total_peers = total_peers, - skip_list_size = skip_list.len(), - target_location = %location, - "Looking for closest peer to location" - ); - for (loc, peers) in &connections { - tracing::debug!(location = %loc, peer_count = peers.len(), "Location has peers"); - } - } - connections - .iter() - .sorted_by(|(loc_a, _), (loc_b, _)| { - loc_a.distance(location).cmp(&loc_b.distance(location)) - }) - .find_map(|(_, conns)| { - // Try all peers at this location, not just random sampling - for conn in conns { - if !skip_list.contains(&conn.location.peer) { - tracing::debug!(selected_peer = %conn.location.peer, "Found closest peer"); - return Some(conn.location.clone()); - } - } - None - }) - } - async fn connection_maintenance( self: Arc, notifier: EventLoopNotificationsSender, live_tx_tracker: LiveTransactionTracker, - mut missing_candidates: mpsc::Receiver, ) -> anyhow::Result<()> { tracing::info!("Initializing connection maintenance task"); let is_gateway = self.is_gateway; @@ -446,13 +404,6 @@ impl Ring { let mut refresh_density_map = tokio::time::interval(REGENERATE_DENSITY_MAP_INTERVAL); refresh_density_map.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - let mut missing = BTreeMap::new(); - - #[cfg(not(test))] - let retry_peers_missing_candidates_interval = Duration::from_secs(60 * 5) * 2; - #[cfg(test)] - let retry_peers_missing_candidates_interval = Duration::from_secs(5); - // if the peer is just starting wait a bit before // we even attempt acquiring more connections tokio::time::sleep(Duration::from_secs(2)).await; @@ -477,28 +428,8 @@ impl Ring { this_peer = Some(peer); continue; }; - loop { - match missing_candidates.try_recv() { - Ok(missing_candidate) => { - missing.insert(Reverse(Instant::now()), missing_candidate); - } - Err(TryRecvError::Empty) => break, - Err(TryRecvError::Disconnected) => { - tracing::debug!("Shutting down connection maintenance"); - anyhow::bail!("finished"); - } - } - } - - // eventually peers which failed to return candidates should be retried when enough time has passed - let retry_missing_candidates_until = - Instant::now() - retry_peers_missing_candidates_interval; - - // remove all missing candidates which have been retried - missing.split_off(&Reverse(retry_missing_candidates_until)); - // avoid connecting to the same peer multiple times - let mut skip_list: HashSet<_> = missing.values().collect(); + let mut skip_list = HashSet::new(); skip_list.insert(this_peer); // if there are no open connections, we need to acquire more @@ -789,6 +720,4 @@ pub(crate) enum RingError { EmptyRing, #[error("Ran out of, or haven't found any, caching peers for contract {0}")] NoCachingPeers(ContractKey), - #[error("No location assigned to this peer")] - NoLocation, } diff --git a/crates/core/src/transport/connection_handler.rs b/crates/core/src/transport/connection_handler.rs index 90899c148..c9aa84132 100644 --- a/crates/core/src/transport/connection_handler.rs +++ b/crates/core/src/transport/connection_handler.rs @@ -113,15 +113,6 @@ pub(crate) struct InboundConnectionHandler { new_connection_notifier: mpsc::Receiver, } -#[cfg(test)] -impl InboundConnectionHandler { - pub fn new(new_connection_notifier: mpsc::Receiver) -> Self { - InboundConnectionHandler { - new_connection_notifier, - } - } -} - impl InboundConnectionHandler { pub async fn next_connection(&mut self) -> Option { self.new_connection_notifier.recv().await @@ -135,16 +126,6 @@ pub(crate) struct OutboundConnectionHandler { expected_non_gateway: Arc>, } -#[cfg(test)] -impl OutboundConnectionHandler { - pub fn new(send_queue: mpsc::Sender<(SocketAddr, ConnectionEvent)>) -> Self { - OutboundConnectionHandler { - send_queue, - expected_non_gateway: Arc::new(DashSet::new()), - } - } -} - impl OutboundConnectionHandler { fn config_listener( socket: Arc, diff --git a/crates/core/src/transport/mod.rs b/crates/core/src/transport/mod.rs index 04ca4dc0c..d833a27cf 100644 --- a/crates/core/src/transport/mod.rs +++ b/crates/core/src/transport/mod.rs @@ -26,13 +26,6 @@ type MessagePayload = Vec; type PacketId = u32; pub use self::crypto::{TransportKeypair, TransportPublicKey}; -#[cfg(test)] -pub(crate) use self::{ - connection_handler::ConnectionEvent, - packet_data::{PacketData, UnknownEncryption}, - peer_connection::RemoteConnection, - symmetric_message::{SymmetricMessage, SymmetricMessagePayload}, -}; pub(crate) use self::{ connection_handler::{ create_connection_handler, InboundConnectionHandler, OutboundConnectionHandler, diff --git a/crates/core/src/transport/packet_data.rs b/crates/core/src/transport/packet_data.rs index 2128a8a4f..a189e1635 100644 --- a/crates/core/src/transport/packet_data.rs +++ b/crates/core/src/transport/packet_data.rs @@ -182,17 +182,6 @@ impl PacketData { } } -#[cfg(test)] -impl PacketData { - pub fn into_unknown(self) -> PacketData { - PacketData { - data: self.data, - size: self.size, - data_type: PhantomData, - } - } -} - impl PacketData { pub fn from_buf(buf: impl AsRef<[u8]>) -> Self { let mut data = [0; N]; diff --git a/crates/core/src/transport/peer_connection.rs b/crates/core/src/transport/peer_connection.rs index e96447dab..cce5bc949 100644 --- a/crates/core/src/transport/peer_connection.rs +++ b/crates/core/src/transport/peer_connection.rs @@ -122,20 +122,6 @@ impl Drop for PeerConnection { } } -#[cfg(test)] -type PeerConnectionMock = ( - PeerConnection, - mpsc::Sender>, - mpsc::Receiver<(SocketAddr, Arc<[u8]>)>, -); - -#[cfg(test)] -type RemoteConnectionMock = ( - RemoteConnection, - mpsc::Sender>, - mpsc::Receiver<(SocketAddr, Arc<[u8]>)>, -); - impl PeerConnection { pub(super) fn new(remote_conn: RemoteConnection) -> Self { const KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(10); @@ -249,69 +235,6 @@ impl PeerConnection { } } - #[cfg(test)] - pub(crate) fn new_test( - remote_addr: SocketAddr, - my_address: SocketAddr, - outbound_symmetric_key: Aes128Gcm, - inbound_symmetric_key: Aes128Gcm, - ) -> PeerConnectionMock { - use crate::transport::crypto::TransportKeypair; - use parking_lot::Mutex; - let (outbound_packets, outbound_packets_recv) = mpsc::channel(100); - let (inbound_packet_sender, inbound_packet_recv) = mpsc::channel(100); - let keypair = TransportKeypair::new(); - let remote = RemoteConnection { - outbound_packets, - outbound_symmetric_key, - remote_addr, - sent_tracker: Arc::new(Mutex::new(SentPacketTracker::new())), - last_packet_id: Arc::new(AtomicU32::new(0)), - inbound_packet_recv, - inbound_symmetric_key, - inbound_symmetric_key_bytes: [1; 16], - my_address: Some(my_address), - transport_secret_key: keypair.secret, - bandwidth_limit: None, - }; - ( - Self::new(remote), - inbound_packet_sender, - outbound_packets_recv, - ) - } - - #[cfg(test)] - pub(crate) fn new_remote_test( - remote_addr: SocketAddr, - my_address: SocketAddr, - outbound_symmetric_key: Aes128Gcm, - inbound_symmetric_key: Aes128Gcm, - ) -> RemoteConnectionMock { - use crate::transport::crypto::TransportKeypair; - use parking_lot::Mutex; - let (outbound_packets, outbound_packets_recv) = mpsc::channel(100); - let (inbound_packet_sender, inbound_packet_recv) = mpsc::channel(100); - let keypair = TransportKeypair::new(); - ( - RemoteConnection { - outbound_packets, - outbound_symmetric_key, - remote_addr, - sent_tracker: Arc::new(Mutex::new(SentPacketTracker::new())), - last_packet_id: Arc::new(AtomicU32::new(0)), - inbound_packet_recv, - inbound_symmetric_key, - inbound_symmetric_key_bytes: [1; 16], - my_address: Some(my_address), - transport_secret_key: keypair.secret, - bandwidth_limit: None, - }, - inbound_packet_sender, - outbound_packets_recv, - ) - } - #[instrument(name = "peer_connection", skip_all)] pub async fn send(&mut self, data: T) -> Result where