diff --git a/doc/telemetry.md b/doc/telemetry.md new file mode 100644 index 0000000..9e2fcec --- /dev/null +++ b/doc/telemetry.md @@ -0,0 +1,34 @@ +# Telemetry in Taskmanager + +The TaskManager application is equipped with OpenTelemetry capabilities. +That means that it supports the use of an OpenTelemetry java agent, even though it is not manually instrumented. + +## Traces + +When using the OpenTelemetry java agent, use of the RabbitMQ library within TaskManager should ensure that spans are created on task pickup/delivery. +Taskmanager will try to keep traces per task intact as much as possible, for instance when switching threads. +Taskmanager does not currently define spans itself explicitly; it relies on the automatic spans created when using the java agent. + +## Metrics + +The TaskManager defines a few custom metrics for OpenTelemetry to capture. +These metrics are all defined with the `nl.aerius.TaskManager` instrumentation scope. + +| metric name | type | description | +|-----------------------------------------|-----------|----------------------------------------------------------------------| +| `aer.taskmanager.worker_size`1 | gauge | The number of workers that are configured according to Taskmanager. | +| `aer.taskmanager.current_worker_size`1 | gauge | The number of workers that are currently in Taskmanager. | +| `aer.taskmanager.running_worker_size`1 | gauge | The number of workers that are occupied in Taskmanager. | +| `aer.taskmanager.running_client_size`2 | gauge | The number of workers that are occupied for a specific client queue. | +| `aer.taskmanager.dispatched`1 | histogram | The number of tasks dispatched. | +| `aer.taskmanager.dispatched.wait`1 | histogram | The average wait time of tasks dispatched. | +| `aer.taskmanager.dispatched.queue`2 | histogram | The number of tasks dispatched per client queue. | +| `aer.taskmanager.dispatched.queue.wait`2 | histogram | The average wait time of tasks dispatched per client queue. 
| +| `aer.taskmanager.work.load`1 | gauge | Percentage of workers used in the timeframe (1 minute). | + +The metrics carry different attributes, indicated by the number after the metric name. +* Metrics marked 1 have the attribute `worker_type`. +* Metrics marked 2 have the attributes `worker_type` and `queue_name`. + +`worker_type` is the type of worker, e.g. `ops`. +`queue_name` is the queue the task was originally put on, e.g. `...calculator_ui_small`. diff --git a/source/taskmanager/src/main/java/nl/aerius/taskmanager/scheduler/priorityqueue/PriorityQueueMap.java b/source/taskmanager/src/main/java/nl/aerius/taskmanager/scheduler/priorityqueue/PriorityQueueMap.java index 1e47b06..971e4c2 100644 --- a/source/taskmanager/src/main/java/nl/aerius/taskmanager/scheduler/priorityqueue/PriorityQueueMap.java +++ b/source/taskmanager/src/main/java/nl/aerius/taskmanager/scheduler/priorityqueue/PriorityQueueMap.java @@ -71,7 +71,10 @@ public void incrementOnWorker(final TaskRecord taskRecord) { } public int onWorkerTotal(final String queueName) { - return tasksOnWorkersPerQueue.entrySet().stream().filter(e -> keyMapper.queueName(e.getKey()).equals(queueName)).mapToInt(e -> e.getValue().get()).sum(); + return tasksOnWorkersPerQueue.entrySet().stream() + .filter(e -> keyMapper.queueName(e.getKey()).equals(queueName)) + .mapToInt(e -> e.getValue().get()) + .sum(); } public int onWorker(final TaskRecord taskRecord) { diff --git a/source/taskmanager/src/main/java/nl/aerius/taskmanager/scheduler/priorityqueue/PriorityTaskSchedulerMetrics.java b/source/taskmanager/src/main/java/nl/aerius/taskmanager/scheduler/priorityqueue/PriorityTaskSchedulerMetrics.java index 728cb59..9f3f945 100644 --- a/source/taskmanager/src/main/java/nl/aerius/taskmanager/scheduler/priorityqueue/PriorityTaskSchedulerMetrics.java +++ b/source/taskmanager/src/main/java/nl/aerius/taskmanager/scheduler/priorityqueue/PriorityTaskSchedulerMetrics.java @@ -20,8 +20,6 @@ import java.util.Map; import java.util.function.IntSupplier; -import
io.opentelemetry.api.common.AttributeKey; -import io.opentelemetry.api.common.Attributes; import io.opentelemetry.api.metrics.ObservableDoubleGauge; import nl.aerius.taskmanager.metrics.OpenTelemetryMetrics; @@ -47,7 +45,8 @@ public void addMetric(final IntSupplier countSupplier, final String workerQueueN metrics.put(clientQueueName, OpenTelemetryMetrics.METER .gaugeBuilder(METRIC_PREFIX) .setDescription(DESCRIPTION) - .buildWithCallback(result -> result.record(countSupplier.getAsInt(), workerDefaultAttributes(workerQueueName, clientQueueName)))); + .buildWithCallback( + result -> result.record(countSupplier.getAsInt(), OpenTelemetryMetrics.queueAttributes(workerQueueName, clientQueueName)))); } /** @@ -60,11 +59,4 @@ public void removeMetric(final String clienQueueName) { metrics.remove(clienQueueName).close(); } } - - private static Attributes workerDefaultAttributes(final String workerQueueName, final String clientQueueName) { - return Attributes.builder() - .put(AttributeKey.stringKey("worker_type"), workerQueueName) - .put(AttributeKey.stringKey("client_queue_name"), clientQueueName) - .build(); - } } diff --git a/source/taskmanager/telemetry.md b/source/taskmanager/telemetry.md deleted file mode 100644 index 136474e..0000000 --- a/source/taskmanager/telemetry.md +++ /dev/null @@ -1,26 +0,0 @@ -# Telemetry in Taskmanager - -The TaskManager application is equipped with OpenTelemetry capabilities. -That means that it supports the use of an OpenTelemetry java agent, even though it is not manually instrumented. - -## Traces - -When using the OpenTelemetry java agent, use of the RabbitMQ library within TaskManager should ensure that spans are created on task pickup/delivery. -Taskmanager will try to keep traces per task intact as much as possible, for instance when switching threads. -Taskmanager does not explicitly defines spans itself currently, it relies on the automatic spans created when using the java agent. 
- -## Metrics - -The TaskManager defines a few custom metrics for OpenTelemetry to capture. -These metrics are all defined with the `nl.aerius.TaskManager` instrumentation scope. - -| metric name | type | description | -|---------------------------------------|---------|----------------------------------------------------------------------| -| `aer.taskmanager.worker_size` | gauge | The number of workers that are configured according to Taskmanager. | -| `aer.taskmanager.current_worker_size` | gauge | The number of workers that are current in Taskmanager. | -| `aer.taskmanager.running_worker_size` | gauge | The number of workers that are occupied in Taskmanager. | -| `aer.taskmanager.running_client_size` | gauge | The number of workers that are occupied for a specific client queue. | -| `aer.taskmanager.dispatched` | counter | The number of tasks dispatched. | - -Current metrics are, per configured queue/worker type (through attribute `worker_type`): -The gauge metric `aer.taskmanger.running_client_size` also has the attribute `client_queue_name` containing the name of the client queue.