Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@ RUN set -eux; \
RUN chown flink:flink /opt/flink/lib/paimon-*.jar /opt/flink/lib/flink-shaded-hadoop-*.jar

# Verify JARs were downloaded
RUN ls -la /opt/flink/lib/paimon-* /opt/flink/lib/flink-shaded-hadoop-*
RUN ls -la /opt/flink/lib/paimon-* /opt/flink/lib/flink-shaded-hadoop-*

# Ensure Prometheus plugin is properly set up
RUN chown -R flink:flink /opt/flink/plugins/metrics-prometheus/
4 changes: 4 additions & 0 deletions conf/flink-conf.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
blob.server.port: 6124
s3.path-style-access: true
taskmanager.memory.process.size: 1728m
metrics.reporter.prom.port: 9249
logger.kafka.level: INFO
jobmanager.rpc.address: jobmanager
web.cancel.enable: true
Expand All @@ -16,4 +17,7 @@ s3.endpoint: http://minio:9000
parallelism.default: 1
taskmanager.numberOfTaskSlots: 4
web.submit.enable: true
metrics.reporters: prom
logger.zookeeper.level: INFO
metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter
metrics.reporter.prom.host: 0.0.0.0
41 changes: 41 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ services:
environment:
MINIO_ROOT_USER: admin
MINIO_ROOT_PASSWORD: password123
MINIO_PROMETHEUS_AUTH_TYPE: public
command: server /data --console-address ":9001"
volumes:
- minio_data:/data
Expand Down Expand Up @@ -71,6 +72,7 @@ services:
command: jobmanager
ports:
- "8083:8081" # Flink Web UI
- "9249:9249" # Prometheus metrics
environment:
- |
FLINK_PROPERTIES=
Expand All @@ -89,6 +91,8 @@ services:
build: .
container_name: flink-taskmanager
command: taskmanager
ports:
- "9250:9249" # Prometheus metrics (different port to avoid conflict)
depends_on:
jobmanager:
condition: service_healthy
Expand All @@ -100,9 +104,46 @@ services:
volumes:
- ./conf:/opt/flink/conf

# Prometheus: collects metrics using the scrape config mounted from
# ./monitoring/prometheus.yml and stores them in the prometheus_data volume.
prometheus:
  container_name: prometheus
  image: prom/prometheus:v2.45.0
  command:
    # Must match the container-side path of the config mount below.
    - "--config.file=/etc/prometheus/prometheus.yml"
    # Must match the container-side path of the prometheus_data mount below.
    - "--storage.tsdb.path=/prometheus"
    - "--web.console.libraries=/etc/prometheus/console_libraries"
    - "--web.console.templates=/etc/prometheus/consoles"
    # Keep samples for 200 hours.
    - "--storage.tsdb.retention.time=200h"
    # Enables the HTTP reload/quit endpoints of the Prometheus server.
    - "--web.enable-lifecycle"
  ports:
    - "9090:9090"  # Prometheus web UI and HTTP API
  volumes:
    - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
    - prometheus_data:/prometheus

# Grafana: dashboards on top of the Prometheus service.
# Data sources and dashboards are provisioned from
# ./monitoring/grafana/provisioning; Grafana state lives in grafana_data.
grafana:
  image: grafana/grafana:10.0.0
  container_name: grafana
  ports:
    - "3000:3000"  # Grafana web UI
  volumes:
    - grafana_data:/var/lib/grafana
    - ./monitoring/grafana/provisioning:/etc/grafana/provisioning
  environment:
    # Admin credentials can be overridden from the shell or an .env file via
    # GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD. When those are unset the
    # previous defaults (admin / admin) still apply, so existing setups keep
    # working; override them anywhere port 3000 is reachable by others.
    - "GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}"
    - "GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}"
    - GF_USERS_ALLOW_SIGN_UP=false
  depends_on:
    # Start order only; the provisioned data source points at this service.
    - prometheus

# Named volumes so service data survives container re-creation.
volumes:
  # MinIO object data (mounted at /data in the minio service).
  minio_data:
    driver: local
  # Prometheus TSDB (mounted at /prometheus).
  prometheus_data:
    driver: local
  # Grafana state (mounted at /var/lib/grafana).
  grafana_data:
    driver: local

networks:
default:
Expand Down
12 changes: 12 additions & 0 deletions monitoring/grafana/provisioning/dashboards/dashboards.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Grafana dashboard provisioning: load every dashboard JSON file found in
# the provisioning/dashboards directory.
apiVersion: 1

providers:
  - name: "default"
    orgId: 1
    # Empty folder name: dashboards are not placed in a named folder.
    folder: ""
    type: file
    # false: dashboards may be deleted.
    disableDeletion: false
    # Re-scan the directory for changed files every 10 seconds.
    updateIntervalSeconds: 10
    # Allow provisioned dashboards to be edited from the Grafana UI.
    allowUiUpdates: true
    options:
      # Container-side path; the compose file mounts
      # ./monitoring/grafana/provisioning at /etc/grafana/provisioning.
      path: /etc/grafana/provisioning/dashboards
73 changes: 73 additions & 0 deletions monitoring/grafana/provisioning/dashboards/flink-dashboard.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
  "id": null,
  "title": "Flink Monitoring",
  "tags": ["flink"],
  "style": "dark",
  "timezone": "browser",
  "refresh": "30s",
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "panels": [
    {
      "id": 1,
      "title": "Job Manager CPU Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "flink_jobmanager_Status_JVM_CPU_Load",
          "refId": "A"
        }
      ],
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
      "fieldConfig": {
        "defaults": {
          "unit": "percentunit"
        }
      }
    },
    {
      "id": 2,
      "title": "Task Manager CPU Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "flink_taskmanager_Status_JVM_CPU_Load",
          "refId": "A"
        }
      ],
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
      "fieldConfig": {
        "defaults": {
          "unit": "percentunit"
        }
      }
    },
    {
      "id": 3,
      "title": "Memory Usage",
      "type": "timeseries",
      "targets": [
        {
          "expr": "flink_jobmanager_Status_JVM_Memory_Heap_Used",
          "refId": "A",
          "legendFormat": "JobManager Heap"
        },
        {
          "expr": "flink_taskmanager_Status_JVM_Memory_Heap_Used",
          "refId": "B",
          "legendFormat": "TaskManager Heap"
        }
      ],
      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 8},
      "fieldConfig": {
        "defaults": {
          "unit": "bytes"
        }
      }
    }
  ],
  "schemaVersion": 27,
  "version": 1
}
103 changes: 103 additions & 0 deletions monitoring/grafana/provisioning/dashboards/minio-dashboard.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
{
  "id": null,
  "title": "MinIO Monitoring",
  "tags": [
    "minio"
  ],
  "style": "dark",
  "timezone": "browser",
  "refresh": "30s",
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "schemaVersion": 27,
  "version": 1,
  "panels": [
    {
      "id": 1,
      "type": "stat",
      "title": "Total Storage",
      "gridPos": {
        "x": 0,
        "y": 0,
        "w": 8,
        "h": 8
      },
      "fieldConfig": {
        "defaults": {
          "unit": "bytes"
        }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "minio_cluster_capacity_raw_total_bytes"
        }
      ]
    },
    {
      "id": 2,
      "type": "stat",
      "title": "Used Storage",
      "gridPos": {
        "x": 8,
        "y": 0,
        "w": 8,
        "h": 8
      },
      "fieldConfig": {
        "defaults": {
          "unit": "bytes"
        }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "minio_cluster_capacity_usable_total_bytes - minio_cluster_capacity_usable_free_bytes"
        }
      ]
    },
    {
      "id": 3,
      "type": "stat",
      "title": "Online Servers",
      "gridPos": {
        "x": 16,
        "y": 0,
        "w": 8,
        "h": 8
      },
      "fieldConfig": {
        "defaults": {
          "unit": "short"
        }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "minio_cluster_nodes_online_total"
        }
      ]
    },
    {
      "id": 4,
      "type": "timeseries",
      "title": "Request Rate",
      "gridPos": {
        "x": 0,
        "y": 8,
        "w": 12,
        "h": 8
      },
      "fieldConfig": {
        "defaults": {
          "unit": "reqps"
        }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "rate(minio_s3_requests_total[5m])",
          "legendFormat": "{{api}}"
        }
      ]
    },
    {
      "id": 5,
      "type": "timeseries",
      "title": "Error Rate",
      "gridPos": {
        "x": 12,
        "y": 8,
        "w": 12,
        "h": 8
      },
      "fieldConfig": {
        "defaults": {
          "unit": "reqps"
        }
      },
      "targets": [
        {
          "refId": "A",
          "expr": "rate(minio_s3_requests_errors_total[5m])",
          "legendFormat": "{{api}}"
        }
      ]
    }
  ]
}
9 changes: 9 additions & 0 deletions monitoring/grafana/provisioning/datasources/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Grafana data source provisioning: register Prometheus as the default
# data source.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    # proxy: queries go through the Grafana backend, so the URL below is
    # resolved from the Grafana server rather than from the browser.
    access: proxy
    url: http://prometheus:9090
    # Panels that do not name a data source use this one.
    isDefault: true
    # Not editable from the Grafana UI; change this file instead.
    editable: false
28 changes: 28 additions & 0 deletions monitoring/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Prometheus scrape configuration for the stack.
global:
  # Defaults; the MinIO and Flink jobs below override scrape_interval.
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # MinIO cluster metrics. No authorization is configured for this job, so
  # the endpoint has to be readable without a token.
  - job_name: "minio"
    scrape_interval: 30s
    metrics_path: /minio/v2/metrics/cluster
    static_configs:
      - targets:
          - "minio:9000"

  # Flink JobManager, scraped on port 9249.
  - job_name: "flink-jobmanager"
    scrape_interval: 30s
    static_configs:
      - targets:
          - "jobmanager:9249"

  # Flink TaskManager, scraped on port 9249.
  - job_name: "flink-taskmanager"
    scrape_interval: 30s
    static_configs:
      - targets:
          - "taskmanager:9249"