More details in [this blogpost](https://www.netways.de/blog/2019/01/24/windows-blocking-icinga-2-with-ephemeral-port-range/)
and this [MS help entry](https://support.microsoft.com/en-us/help/196271/when-you-try-to-connect-from-tcp-ports-greater-than-5000-you-receive-t).

### Cluster Troubleshooting: Overloaded JSON-RPC Connections <a id="troubleshooting-jsonrpc-overload"></a>

If JSON-RPC connections are overloaded, messages are processed with a delay. This can show up in symptoms like the
master lagging behind, for example displaying check results only minutes after they were available on a satellite.

There are two ways this situation can be identified:

First, if a connection is overloaded, Icinga 2 will read data from it more slowly than it arrives, so pending messages
accumulate in the TCP receive queue on the overloaded endpoint and in the TCP send queue of other endpoints sending to
it. This can be checked by querying information about open TCP connections using the command
`ss --tcp --processes --numeric`. High values for Recv-Q on a socket used by the `icinga2` process can be a hint that
the local endpoint is not able to keep up with the messages from this connection. Note that small values (a few
kilobytes) are perfectly fine as messages can be in flight. Also, while the replay log is being received, messages are
processed as fast as possible and the connection operates at capacity anyway, so the size of the TCP receive queue is
only meaningful after the replay log has been processed.

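For example, a quick look at the queues of the cluster connections could be obtained like this. This is only a sketch:
it assumes the default API port 5665 and root privileges so that `--processes` can resolve process names.

```
# Show TCP sockets on the Icinga 2 API port (5665 by default) together with
# their receive/send queue sizes (Recv-Q/Send-Q) and the owning process.
ss --tcp --processes --numeric '( sport = :5665 or dport = :5665 )'
```
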
Second, Icinga 2.15.1 introduced a metric that can be used to estimate how much load there is on a particular
connection: the `seconds_processing_messages` attribute of `Endpoint` objects, which can be
[queried using the API](12-icinga2-api.md#icinga2-api-config-objects-query). This value accumulates the total time spent
processing JSON-RPC messages from connections to that endpoint. In order to interpret this number, you have to query it
at least twice and calculate the rate at which it increased. For example, a rate of 0.4 (increases by 0.4s every
second) means that the connection is at around 40% of its maximum capacity. In practice, the rate will never reach the
theoretical maximum of 1 as there is also some time spent reading the messages, so if it is close to 1, the connection
might be overloaded or close to its capacity limit.

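A minimal sketch of how this rate could be calculated, assuming the API is reachable on `localhost:5665`, an API user
`root:icinga`, an endpoint named `satellite1` and `jq` being installed (these names are examples, not part of the
documentation):

```
# Query seconds_processing_messages twice, 60 seconds apart, and compute the
# rate at which it increased during that window.
url='https://localhost:5665/v1/objects/endpoints/satellite1?attrs=seconds_processing_messages'
t1=$(curl -k -s -S -u root:icinga -H 'Accept: application/json' "$url" \
  | jq '.results[0].attrs.seconds_processing_messages')
sleep 60
t2=$(curl -k -s -S -u root:icinga -H 'Accept: application/json' "$url" \
  | jq '.results[0].attrs.seconds_processing_messages')
# A rate close to 1 means the connection is at or near its capacity limit.
echo "rate: $(echo "($t2 - $t1) / 60" | bc -l)"
```
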
This limit in capacity exists because there can be implicit dependencies between different JSON-RPC messages,
requiring them to be processed in the same order in which they were sent. This is currently ensured by processing all
messages from the same connection sequentially.

To work around this limit, the following approaches are possible:
1. Try to redistribute load between connections. For example, if the overloaded connection is between the master and
   a satellite zone, try splitting this zone into two, distributing the load across two connections (see the
   configuration sketch after this list).
2. Reduce the load on that connection. Typically, the most frequent message type will be check results, so increasing
   the check intervals, i.e. checking less frequently, can be a first step.
3. As the messages are processed sequentially, the throughput is limited by the single-core CPU performance of the
   machine Icinga 2 is running on. Switching to a more powerful one can increase the capacity of individual connections.
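
The following is only a sketch of what splitting a satellite zone in `zones.conf` on the master could look like; all
endpoint and zone names as well as the addresses are made-up examples:

```
// Two separate satellite zones, each with its own endpoint and therefore its
// own JSON-RPC connection to the master zone.
object Endpoint "satellite1.example.com" {
  host = "192.0.2.11"
}

object Endpoint "satellite2.example.com" {
  host = "192.0.2.12"
}

object Zone "satellite-a" {
  endpoints = [ "satellite1.example.com" ]
  parent = "master"
}

object Zone "satellite-b" {
  endpoints = [ "satellite2.example.com" ]
  parent = "master"
}
```

The hosts and services previously assigned to the single zone then have to be distributed across the two new zones so
that their check results actually travel over both connections.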