From a9748a9e9800f79432451114518d71bb3cf39d0d Mon Sep 17 00:00:00 2001 From: Olaoluwa Osuntokun Date: Thu, 2 Oct 2025 20:11:40 -0700 Subject: [PATCH 1/4] rfq: add client-side keepalive to price oracle connections In this commit, we add comprehensive client-side gRPC keepalive parameters to the price oracle client connections. This addresses the root cause of issue #1814, where connections to price oracle servers were being silently closed after idle periods, resulting in "connection reset by peer" errors during RFQ operations. The key change is setting PermitWithoutStream to true, which allows the client to send keepalive pings even when there are no active RPC calls. This is essential for long-lived connections that may experience extended idle periods between price queries. Without this setting, idle connections would be closed by intermediaries or the server itself, leaving the client unaware of the broken connection until the next RPC attempt. We configure the client to ping the server after 30 seconds of inactivity and wait up to 20 seconds for a response before treating the connection as dead. These values are conservative enough to detect connection issues quickly while avoiding excessive network traffic. The same keepalive parameters are applied to both TLS and insecure (testing-only) connection modes to ensure consistent behavior. 
Fixes #1814 --- rfq/oracle.go | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/rfq/oracle.go b/rfq/oracle.go index e7ee286945..96b3c565fa 100644 --- a/rfq/oracle.go +++ b/rfq/oracle.go @@ -18,6 +18,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/keepalive" ) // PriceQueryIntent is an enum that represents the intent of a price rate @@ -186,16 +187,38 @@ type RpcPriceOracle struct { rawConn *grpc.ClientConn } +// clientKeepaliveDialOption configures bidirectional health probing to prevent +// idle RFQ connections from being silently terminated by network intermediaries +// (NATs, load balancers) or aggressive server timeouts. Without active +// keepalive, the first price query after an idle period would fail with +// "connection reset by peer" and require a retry. +var clientKeepaliveDialOption = grpc.WithKeepaliveParams( + keepalive.ClientParameters{ + // Ping server after 30 seconds of inactivity. + Time: 30 * time.Second, + + // Wait 20 seconds for ping response. + Timeout: 20 * time.Second, + + // Permit keepalive pings even when there are no active + // streams. This is critical for long-lived connections with + // infrequent RFQ requests. + PermitWithoutStream: true, + }, +) + // serverDialOpts returns the set of server options needed to connect to the // price oracle RPC server using a TLS connection. func serverDialOpts() ([]grpc.DialOption, error) { var opts []grpc.DialOption - // Skip TLS certificate verification. 
tlsConfig := tls.Config{InsecureSkipVerify: true} transportCredentials := credentials.NewTLS(&tlsConfig) + opts = append(opts, grpc.WithTransportCredentials(transportCredentials)) + opts = append(opts, clientKeepaliveDialOption) + return opts, nil } @@ -209,6 +232,8 @@ func insecureServerDialOpts() ([]grpc.DialOption, error) { insecure.NewCredentials(), )) + opts = append(opts, clientKeepaliveDialOption) + return opts, nil } From b19a3f4c73a3418e77b7f03c66f97c524113a9db Mon Sep 17 00:00:00 2001 From: Olaoluwa Osuntokun Date: Thu, 2 Oct 2025 20:12:22 -0700 Subject: [PATCH 2/4] server: enhance gRPC keepalive configuration In this commit, we enhance the server-side gRPC keepalive configuration to work in coordination with the client-side keepalive settings added in the previous commit. This completes the fix for issue #1814 by ensuring both sides of the connection actively maintain connection health. Previously, the server only set MaxConnectionIdle (to 2 minutes), which aggressively closed idle connections. This caused problems for price oracle connections that could be idle for extended periods between RFQ operations. We now extend MaxConnectionIdle to 24 hours and add active health checking through Time and Timeout parameters. The critical addition is the EnforcementPolicy with PermitWithoutStream set to true. This allows clients to send keepalive pings even when no RPC calls are active, which is essential for long-lived connections. Without this policy, the server would reject client keepalive pings on idle connections, defeating the purpose of the client-side keepalive configuration. These settings follow the same pattern used by lnd and are based on gRPC's official keepalive recommendations. The combination of active pinging from both client and server, along with permissive policies, ensures connections remain healthy and any network issues are detected promptly rather than discovered only when the next RPC fails. 
Fixes #1814 --- server.go | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/server.go b/server.go index 23698a4d79..ecabc76b5b 100644 --- a/server.go +++ b/server.go @@ -370,11 +370,34 @@ func (s *Server) RunUntilShutdown(mainErrChan <-chan error) error { serverOpts = append(serverOpts, rpcServerOpts...) serverOpts = append(serverOpts, ServerMaxMsgReceiveSize) - keepAliveParams := keepalive.ServerParameters{ - MaxConnectionIdle: time.Minute * 2, - } - - serverOpts = append(serverOpts, grpc.KeepaliveParams(keepAliveParams)) + // Configure server-side keepalive parameters. These settings allow the + // server to actively probe the connection health and ensure connections + // stay alive during idle periods. + serverKeepalive := keepalive.ServerParameters{ + // Ping client after 1 minute of inactivity. + Time: time.Minute, + // Wait 20 seconds for ping response. + Timeout: 20 * time.Second, + // Allow connections to remain idle for extended periods. This + // is particularly important for RFQ operations where price + // oracle connections may be idle for long periods. + MaxConnectionIdle: time.Hour * 24, + } + + // Configure client enforcement policy. This allows clients to send + // keepalive pings even when there are no active streams, which is + // crucial for long-lived connections with infrequent activity. + clientKeepalive := keepalive.EnforcementPolicy{ + // Minimum time between client pings. + MinTime: 5 * time.Second, + // Allow pings without active RPCs. + PermitWithoutStream: true, + } + + serverOpts = append( + serverOpts, grpc.KeepaliveParams(serverKeepalive), + grpc.KeepaliveEnforcementPolicy(clientKeepalive), + ) grpcServer := grpc.NewServer(serverOpts...) 
defer grpcServer.Stop() From 48dd110ccae43757553122a781340b81fb5cd0f1 Mon Sep 17 00:00:00 2001 From: Olaoluwa Osuntokun Date: Thu, 9 Oct 2025 18:38:11 +0100 Subject: [PATCH 3/4] docs/examples/basic-price-oracle: add keep alive settings to example --- docs/examples/basic-price-oracle/main.go | 38 ++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/examples/basic-price-oracle/main.go b/docs/examples/basic-price-oracle/main.go index 83609fc7c9..4fde54c0c8 100644 --- a/docs/examples/basic-price-oracle/main.go +++ b/docs/examples/basic-price-oracle/main.go @@ -29,6 +29,7 @@ import ( "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/credentials" + "google.golang.org/grpc/keepalive" ) const ( @@ -455,11 +456,44 @@ func main() { log.Fatalf("Failed to generate TLS certificate: %v", err) } - // Create the gRPC server with TLS + // Configure server-side keepalive parameters. These settings ensure the + // server actively probes client connection health and allows long-lived + // idle connections. + serverKeepalive := keepalive.ServerParameters{ + // Ping clients after 1 minute of inactivity. + Time: time.Minute, + + // Wait 20 seconds for ping response. + Timeout: 20 * time.Second, + + // Allow connections to stay idle for 24 hours. The active + // pinging mechanism (via Time parameter) handles health + // checking, so we don't need aggressive idle timeouts. + MaxConnectionIdle: time.Hour * 24, + } + + // Configure client keepalive enforcement policy. This tells the server + // how to handle client keepalive pings. + clientKeepalive := keepalive.EnforcementPolicy{ + // Allow client to ping even when there are no active RPCs. + // This is critical for long-lived connections with infrequent + // price queries. + PermitWithoutStream: true, + + // Prevent abusive clients from pinging too frequently (DoS + // protection). 
+ MinTime: 5 * time.Second, + } + + // Create the gRPC server with TLS and keepalive configuration. transportCredentials := credentials.NewTLS(&tls.Config{ Certificates: []tls.Certificate{tlsCert}, }) - backendService := grpc.NewServer(grpc.Creds(transportCredentials)) + backendService := grpc.NewServer( + grpc.Creds(transportCredentials), + grpc.KeepaliveParams(serverKeepalive), + grpc.KeepaliveEnforcementPolicy(clientKeepalive), + ) err = startService(backendService) if err != nil { From ba070420aab2ee70b27ab3c45837621c8fcad4b9 Mon Sep 17 00:00:00 2001 From: Olaoluwa Osuntokun Date: Thu, 9 Oct 2025 18:40:06 +0100 Subject: [PATCH 4/4] docs/release-notes: add release notes entry --- docs/release-notes/release-notes-0.7.0.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/release-notes/release-notes-0.7.0.md b/docs/release-notes/release-notes-0.7.0.md index 92eba52509..1710804fbb 100644 --- a/docs/release-notes/release-notes-0.7.0.md +++ b/docs/release-notes/release-notes-0.7.0.md @@ -63,6 +63,18 @@ dirty. This issue has been resolved, and the behavior is now consistent across all database backend types. +- [Fixed "connection reset by peer" errors in RFQ price oracle + connections](https://github.com/lightninglabs/taproot-assets/pull/1834) by + implementing comprehensive bidirectional gRPC keepalive configuration. The + issue occurred when connections sat idle between RFQ price queries and were + silently closed by the network layer or server timeout, causing the first + payment attempt after an idle period to fail. The fix adds client-side + keepalive pings every 30 seconds and extends the server's idle connection + timeout from 2 minutes to 24 hours, while enabling active health checking on + both sides. This ensures connections remain alive during infrequent RFQ + operations and any network issues are detected promptly rather than + discovered only when the next RPC fails. + # New Features ## Functional Enhancements