- 
                Notifications
    You must be signed in to change notification settings 
- Fork 597
Description
Hi,
I have some suggestions for improving the ElasticsearchWriter.
Currently the ElasticsearchWriter creates a new field for every metric in a given check command.
This leads to a vast number of fields in the Elasticsearch/OpenSearch indices.
Here are some examples:
{
  "_index": "icinga",
  "_id": "UQdo5ZcByMLqDokf4UFI",
  "_source": {
    "@timestamp": "2025-07-07T15:02:20.573+0000",
    "check_command": "load",
    "check_result.check_source": "homestead",
    "check_result.command": [
      "/usr/lib/nagios/plugins/check_load",
      "-c",
      "10,6,4",
      "-w",
      "5,4,3"
    ],
    "check_result.execution_end": "2025-07-07T15:02:20.573+0000",
    "check_result.execution_start": "2025-07-07T15:02:20.570+0000",
    "check_result.execution_time": 0.0027740001678466797,
    "check_result.exit_status": 0,
    "check_result.latency": 0.0003490447998046875,
    "check_result.output": "OK - load average: 0.24, 0.25, 0.26",
    "check_result.perfdata.load1.crit": 10,
    "check_result.perfdata.load1.min": 0,
    "check_result.perfdata.load1.value": 0.24,
    "check_result.perfdata.load1.warn": 5,
    "check_result.perfdata.load15.crit": 4,
    "check_result.perfdata.load15.min": 0,
    "check_result.perfdata.load15.value": 0.26,
    "check_result.perfdata.load15.warn": 3,
    "check_result.perfdata.load5.crit": 6,
    "check_result.perfdata.load5.min": 0,
    "check_result.perfdata.load5.value": 0.25,
    "check_result.perfdata.load5.warn": 4,
    "check_result.schedule_end": "2025-07-07T15:02:20.573+0000",
    "check_result.schedule_start": "2025-07-07T15:02:20.569+0000",
    "check_result.state": 0,
    "check_result.vars_after": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "check_result.vars_before": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "current_check_attempt": 1,
    "host": "homestead",
    "last_hard_state": 0,
    "last_state": 0,
    "max_check_attempts": 5,
    "reachable": true,
    "service": "load",
    "state": 0,
    "state_type": 1,
    "timestamp": "2025-07-07T15:02:20.573+0000",
    "type": "icinga2.event.checkresult"
  }
},
{
  "_index": "icinga",
  "_id": "-j6O6pcBNgAT8LEo7v53",
  "_score": 1,
  "_source": {
    "@timestamp": "2025-07-08T15:01:56.548+0000",
    "check_command": "ping4",
    "check_result.check_source": "homestead",
    "check_result.command": [
      "/usr/lib/nagios/plugins/check_ping",
      "-4",
      "-H",
      "127.0.0.1",
      "-c",
      "200,15%",
      "-w",
      "100,5%"
    ],
    "check_result.execution_end": "2025-07-08T15:01:56.548+0000",
    "check_result.execution_start": "2025-07-08T15:01:52.466+0000",
    "check_result.execution_time": 4.081341981887817,
    "check_result.exit_status": 0,
    "check_result.latency": 0.0006992816925048828,
    "check_result.output": "PING OK - Packet loss = 0%, RTA = 0.07 ms",
    "check_result.perfdata.pl.crit": 15,
    "check_result.perfdata.pl.min": 0,
    "check_result.perfdata.pl.unit": "percent",
    "check_result.perfdata.pl.value": 0,
    "check_result.perfdata.pl.warn": 5,
    "check_result.perfdata.rta.crit": 0.2,
    "check_result.perfdata.rta.min": 0,
    "check_result.perfdata.rta.unit": "seconds",
    "check_result.perfdata.rta.value": 7.4e-05,
    "check_result.perfdata.rta.warn": 0.1,
    "check_result.schedule_end": "2025-07-08T15:01:56.548+0000",
    "check_result.schedule_start": "2025-07-08T15:01:52.466+0000",
    "check_result.state": 0,
    "check_result.vars_after": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "check_result.vars_before": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "current_check_attempt": 1,
    "host": "Demo1",
    "last_hard_state": 0,
    "last_state": 0,
    "max_check_attempts": 5,
    "reachable": true,
    "service": "ping4",
    "state": 0,
    "state_type": 1,
    "timestamp": "2025-07-08T15:01:56.548+0000",
    "type": "icinga2.event.checkresult"
  }
}
This means the ElasticsearchWriter is practically unusable in large Icinga2 setups when enable_send_perfdata is enabled, because a large number of fields can lead to performance degradation and memory issues in clusters (see https://www.elastic.co/guide/en/elasticsearch/reference/8.18/mapping-settings-limit.html).
By default, Elasticsearch/OpenSearch allow only 1000 fields per index, to safeguard clusters against indices that create too many fields.
As mentioned, the Icinga2 ElasticsearchWriter creates a new field for each metric in a check plugin. Each new plugin, each new metric adds new fields to the index. See also: #6805
A possible solution
One solution could be to redesign the output of the ElasticsearchWriter to use an array of nested objects for the performance data.
These are supported in both Elasticsearch and OpenSearch:
- https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/nested
- https://docs.opensearch.org/docs/latest/field-types/supported-field-types/nested/
The nested type is a specialised version of the object data type that allows arrays of objects to be indexed in a way that they can be queried independently of each other.
Example from the docs:
{
  "patients": [
    {"name" : "John Doe", "age" : 56, "smoker" : true},
    {"name" : "Mary Major", "age" : 85, "smoker" : false}
  ]
}
There are some pitfalls to look out for. By default, arrays of objects are flattened during indexing, so the ElasticsearchWriter needs to make sure the field is mapped with the nested type.
An example of what this would look like in Icinga2:
{
  "_index": "icinga",
  "_id": "UQdo5ZcByMLqDokf4UFI",
  "_score": 1,
  "_source": {
    "@timestamp": "2025-07-07T15:02:20.573+0000",
    "check_command": "load",
    "check_result.check_source": "homestead",
    "check_result.command": [
      "/usr/lib/nagios/plugins/check_load",
      "-c",
      "10,6,4",
      "-w",
      "5,4,3"
    ],
    "check_result.execution_end": "2025-07-07T15:02:20.573+0000",
    "check_result.execution_start": "2025-07-07T15:02:20.570+0000",
    "check_result.execution_time": 0.0027740001678466797,
    "check_result.exit_status": 0,
    "check_result.latency": 0.0003490447998046875,
    "check_result.output": "OK - load average: 0.24, 0.25, 0.26",
    "check_result.perfdata": [
        {
            "metric_name": "load1",
            "crit": 10,
            "min": 0,
            "value": 0.24,
            "warn": 5
        },
        {
            "metric_name": "load15",
            "crit": 4,
            "min": 0,
            "value": 0.26,
            "warn": 3
        },
        {
            "metric_name": "load5",
            "crit": 6,
            "min": 0,
            "value": 0.25,
            "warn": 4
        }
    ],
    "check_result.schedule_end": "2025-07-07T15:02:20.573+0000",
    "check_result.schedule_start": "2025-07-07T15:02:20.569+0000",
    "check_result.state": 0,
    "check_result.vars_after": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "check_result.vars_before": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "current_check_attempt": 1,
    "host": "homestead",
    "last_hard_state": 0,
    "last_state": 0,
    "max_check_attempts": 5,
    "reachable": true,
    "service": "load",
    "state": 0,
    "state_type": 1,
    "timestamp": "2025-07-07T15:02:20.573+0000",
    "type": "icinga2.event.checkresult"
  }
}
Benefits:
- Avoids creating new fields for each metric in a check plugin
- Makes the index fields more predictable when searching
- Makes the perfdata independently searchable
Obviously, this would be a breaking change.
However, I don't see how the ElasticsearchWriter's enable_send_perfdata option could ever be usable in practice without reworking the current performance data output, regardless of how it is reworked: with the current implementation, every cluster is doomed to run into the field mapping limit eventually, as fields cannot be removed from the mapping.
Note: I do realize that the current daily index rotation might alleviate this somewhat, until the number of plugins and metrics grows large enough to hit the limit.
Regards,
Markus