From bacdf6ebe07ef3fca2259e0f28c827bbf69f4699 Mon Sep 17 00:00:00 2001
From: Gordon Murray
Date: Fri, 5 Sep 2025 17:03:42 +0100
Subject: [PATCH 1/3] Added Prometheus, Grafana and starting dashboards for Minio and Flink

---
Review notes (ignored by `git am`, not part of the commit):

* Adds `prometheus` and `grafana` services to docker-compose.yml, enables
  the Flink Prometheus reporter on port 9249, sets
  MINIO_PROMETHEUS_AUTH_TYPE=public on MinIO, and provisions one Grafana
  datasource plus two starter dashboards.
* The two Flink CPU panels use the Grafana unit "percentunit" because they
  plot `..._Status_JVM_CPU_Load` directly; that gauge is expected to be a
  0.0-1.0 fraction, which "percent" would render 100x too small.
* NOTE(review): this series was recovered from a whitespace-collapsed copy.
  Indentation of context lines is reconstructed (2-space YAML/JSON) and the
  `index` blob hashes were not recomputed - confirm against the target
  tree before applying.

 Dockerfile                                    |   5 +-
 conf/flink-conf.yaml                          |   4 +
 docker-compose.yml                            |  41 +++++++
 .../provisioning/dashboards/dashboards.yml    |  12 ++
 .../dashboards/flink-dashboard.json           |  73 +++++++++++++
 .../dashboards/minio-dashboard.json           | 103 ++++++++++++++++++
 .../provisioning/datasources/prometheus.yml   |   9 ++
 monitoring/prometheus.yml                     |  28 +++++
 8 files changed, 274 insertions(+), 1 deletion(-)
 create mode 100644 monitoring/grafana/provisioning/dashboards/dashboards.yml
 create mode 100644 monitoring/grafana/provisioning/dashboards/flink-dashboard.json
 create mode 100644 monitoring/grafana/provisioning/dashboards/minio-dashboard.json
 create mode 100644 monitoring/grafana/provisioning/datasources/prometheus.yml
 create mode 100644 monitoring/prometheus.yml

diff --git a/Dockerfile b/Dockerfile
index 32384da..6a73bef 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,4 +14,7 @@ RUN set -eux; \
 RUN chown flink:flink /opt/flink/lib/paimon-*.jar /opt/flink/lib/flink-shaded-hadoop-*.jar
 
 # Verify JARs were downloaded
-RUN ls -la /opt/flink/lib/paimon-* /opt/flink/lib/flink-shaded-hadoop-*
\ No newline at end of file
+RUN ls -la /opt/flink/lib/paimon-* /opt/flink/lib/flink-shaded-hadoop-*
+
+# Ensure Prometheus plugin is properly set up
+RUN chown -R flink:flink /opt/flink/plugins/metrics-prometheus/
\ No newline at end of file
diff --git a/conf/flink-conf.yaml b/conf/flink-conf.yaml
index 8845e92..0699964 100644
--- a/conf/flink-conf.yaml
+++ b/conf/flink-conf.yaml
@@ -1,6 +1,7 @@
 blob.server.port: 6124
 s3.path-style-access: true
 taskmanager.memory.process.size: 1728m
+metrics.reporter.prom.port: 9249
 logger.kafka.level: INFO
 jobmanager.rpc.address: jobmanager
 web.cancel.enable: true
@@ -16,4 +17,7 @@ s3.endpoint: http://minio:9000
 parallelism.default: 1
 taskmanager.numberOfTaskSlots: 4
 web.submit.enable: true
+metrics.reporters: prom
 logger.zookeeper.level: INFO
+metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter
+metrics.reporter.prom.host: 0.0.0.0
diff --git a/docker-compose.yml b/docker-compose.yml
index 1f7cdc4..f335c25 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -10,6 +10,7 @@ services:
     environment:
       MINIO_ROOT_USER: admin
       MINIO_ROOT_PASSWORD: password123
+      MINIO_PROMETHEUS_AUTH_TYPE: public
     command: server /data --console-address ":9001"
     volumes:
       - minio_data:/data
@@ -71,6 +72,7 @@ services:
     command: jobmanager
     ports:
       - "8083:8081" # Flink Web UI
+      - "9249:9249" # Prometheus metrics
     environment:
       - |
         FLINK_PROPERTIES=
@@ -89,6 +91,8 @@ services:
     build: .
     container_name: flink-taskmanager
     command: taskmanager
+    ports:
+      - "9250:9249" # Prometheus metrics (different port to avoid conflict)
     depends_on:
       jobmanager:
         condition: service_healthy
@@ -100,9 +104,46 @@ services:
     volumes:
       - ./conf:/opt/flink/conf
 
+  # Prometheus for metrics collection
+  prometheus:
+    image: prom/prometheus:v2.45.0
+    container_name: prometheus
+    ports:
+      - "9090:9090"
+    volumes:
+      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
+      - prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--storage.tsdb.retention.time=200h'
+      - '--web.enable-lifecycle'
+
+  # Grafana for visualization
+  grafana:
+    image: grafana/grafana:10.0.0
+    container_name: grafana
+    ports:
+      - "3000:3000"
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning
+    environment:
+      - GF_SECURITY_ADMIN_USER=admin
+      - GF_SECURITY_ADMIN_PASSWORD=admin
+      - GF_USERS_ALLOW_SIGN_UP=false
+    depends_on:
+      - prometheus
+
 volumes:
   minio_data:
     driver: local
+  prometheus_data:
+    driver: local
+  grafana_data:
+    driver: local
 
 networks:
   default:
diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml
new file mode 100644
index 0000000..80bea3b
--- /dev/null
+++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,12 @@
+apiVersion: 1
+
+providers:
+  - name: 'default'
+    orgId: 1
+    folder: ''
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 10
+    allowUiUpdates: true
+    options:
+      path: /etc/grafana/provisioning/dashboards
\ No newline at end of file
diff --git a/monitoring/grafana/provisioning/dashboards/flink-dashboard.json b/monitoring/grafana/provisioning/dashboards/flink-dashboard.json
new file mode 100644
index 0000000..1bdbb19
--- /dev/null
+++ b/monitoring/grafana/provisioning/dashboards/flink-dashboard.json
@@ -0,0 +1,73 @@
+{
+  "id": null,
+  "title": "Flink Monitoring",
+  "tags": ["flink"],
+  "style": "dark",
+  "timezone": "browser",
+  "refresh": "30s",
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "panels": [
+    {
+      "id": 1,
+      "title": "Job Manager CPU Usage",
+      "type": "stat",
+      "targets": [
+        {
+          "expr": "flink_jobmanager_Status_JVM_CPU_Load",
+          "refId": "A"
+        }
+      ],
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit"
+        }
+      }
+    },
+    {
+      "id": 2,
+      "title": "Task Manager CPU Usage",
+      "type": "stat",
+      "targets": [
+        {
+          "expr": "flink_taskmanager_Status_JVM_CPU_Load",
+          "refId": "A"
+        }
+      ],
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit"
+        }
+      }
+    },
+    {
+      "id": 3,
+      "title": "Memory Usage",
+      "type": "timeseries",
+      "targets": [
+        {
+          "expr": "flink_jobmanager_Status_JVM_Memory_Heap_Used",
+          "refId": "A",
+          "legendFormat": "JobManager Heap"
+        },
+        {
+          "expr": "flink_taskmanager_Status_JVM_Memory_Heap_Used",
+          "refId": "B",
+          "legendFormat": "TaskManager Heap"
+        }
+      ],
+      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 8},
+      "fieldConfig": {
+        "defaults": {
+          "unit": "bytes"
+        }
+      }
+    }
+  ],
+  "schemaVersion": 27,
+  "version": 1
+}
\ No newline at end of file
diff --git a/monitoring/grafana/provisioning/dashboards/minio-dashboard.json b/monitoring/grafana/provisioning/dashboards/minio-dashboard.json
new file mode 100644
index 0000000..2690338
--- /dev/null
+++ b/monitoring/grafana/provisioning/dashboards/minio-dashboard.json
@@ -0,0 +1,103 @@
+{
+  "id": null,
+  "title": "MinIO Monitoring",
+  "tags": ["minio"],
+  "style": "dark",
+  "timezone": "browser",
+  "refresh": "30s",
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "panels": [
+    {
+      "id": 1,
+      "title": "Total Storage",
+      "type": "stat",
+      "targets": [
+        {
+          "expr": "minio_cluster_capacity_raw_total_bytes",
+          "refId": "A"
+        }
+      ],
+      "gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
+      "fieldConfig": {
+        "defaults": {
+          "unit": "bytes"
+        }
+      }
+    },
+    {
+      "id": 2,
+      "title": "Used Storage",
+      "type": "stat",
+      "targets": [
+        {
+          "expr": "minio_cluster_capacity_usable_total_bytes - minio_cluster_capacity_usable_free_bytes",
+          "refId": "A"
+        }
+      ],
+      "gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
+      "fieldConfig": {
+        "defaults": {
+          "unit": "bytes"
+        }
+      }
+    },
+    {
+      "id": 3,
+      "title": "Online Servers",
+      "type": "stat",
+      "targets": [
+        {
+          "expr": "minio_cluster_nodes_online_total",
+          "refId": "A"
+        }
+      ],
+      "gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      }
+    },
+    {
+      "id": 4,
+      "title": "Request Rate",
+      "type": "timeseries",
+      "targets": [
+        {
+          "expr": "rate(minio_s3_requests_total[5m])",
+          "refId": "A",
+          "legendFormat": "{{api}}"
+        }
+      ],
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps"
+        }
+      }
+    },
+    {
+      "id": 5,
+      "title": "Error Rate",
+      "type": "timeseries",
+      "targets": [
+        {
+          "expr": "rate(minio_s3_requests_errors_total[5m])",
+          "refId": "A",
+          "legendFormat": "{{api}}"
+        }
+      ],
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps"
+        }
+      }
+    }
+  ],
+  "schemaVersion": 27,
+  "version": 1
+}
\ No newline at end of file
diff --git a/monitoring/grafana/provisioning/datasources/prometheus.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml
new file mode 100644
index 0000000..d1792ff
--- /dev/null
+++ b/monitoring/grafana/provisioning/datasources/prometheus.yml
@@ -0,0 +1,9 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
\ No newline at end of file
diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml
new file mode 100644
index 0000000..8fc22c8
--- /dev/null
+++ b/monitoring/prometheus.yml
@@ -0,0 +1,28 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  # Prometheus itself
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # MinIO metrics
+  - job_name: 'minio'
+    metrics_path: /minio/v2/metrics/cluster
+    static_configs:
+      - targets: ['minio:9000']
+    scrape_interval: 30s
+
+  # Flink JobManager metrics
+  - job_name: 'flink-jobmanager'
+    static_configs:
+      - targets: ['jobmanager:9249']
+    scrape_interval: 30s
+
+    # Flink TaskManager metrics
+  - job_name: 'flink-taskmanager'
+    static_configs:
+      - targets: ['taskmanager:9249']
+    scrape_interval: 30s
\ No newline at end of file
From 4f4fb0b25b151bdbaf2ae9ad0c438b0a514abd38 Mon Sep 17 00:00:00 2001
From: Gordon Murray
Date: Mon, 8 Sep 2025 11:22:50 +0100
Subject: [PATCH 2/3] Update docker-compose.yml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
Review notes (ignored by `git am`, not part of the commit):

* The jobmanager service already publishes host port 9249 (patch 1/3), so
  the taskmanager must keep a different host port; publishing
  "9249:9249" on both services makes `docker compose up` fail with a
  port-already-allocated error. Only the comment wording changes here.
* Prometheus scrapes `taskmanager:9249` over the compose network, so the
  host-side port number does not affect metric collection.

 docker-compose.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index f335c25..7502176 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -92,7 +92,7 @@ services:
     container_name: flink-taskmanager
     command: taskmanager
     ports:
-      - "9250:9249" # Prometheus metrics (different port to avoid conflict)
+      - "9250:9249" # Prometheus metrics (host port 9250; 9249 is taken by jobmanager)
     depends_on:
       jobmanager:
         condition: service_healthy
From ca79571eec2387d050790a7f6c4667f49d238761 Mon Sep 17 00:00:00 2001
From: Gordon Murray
Date: Mon, 8 Sep 2025 11:23:03 +0100
Subject: [PATCH 3/3] Update monitoring/prometheus.yml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
Review notes (ignored by `git am`, not part of the commit):

* Whitespace-only change to the "Flink TaskManager metrics" comment.
* NOTE(review): the exact before/after whitespace was lost in extraction;
  it is reconstructed here as a re-indent from 4 to 2 spaces - confirm
  against the original commit.

 monitoring/prometheus.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml
index 8fc22c8..9a745ad 100644
--- a/monitoring/prometheus.yml
+++ b/monitoring/prometheus.yml
@@ -21,7 +21,7 @@ scrape_configs:
       - targets: ['jobmanager:9249']
     scrape_interval: 30s
 
-    # Flink TaskManager metrics
+  # Flink TaskManager metrics
   - job_name: 'flink-taskmanager'
     static_configs:
       - targets: ['taskmanager:9249']