From 3e03116a64486afa069cd5e645e3413e6e14ca8e Mon Sep 17 00:00:00 2001 From: nabil salah Date: Thu, 3 Jul 2025 11:54:27 +0300 Subject: [PATCH] feat: implement uptime scoring for nodes Signed-off-by: nabil salah --- grid-proxy/internal/explorer/converters.go | 2 + .../internal/explorer/db/indexer_calls.go | 57 ++++++++++++++++++- grid-proxy/internal/explorer/db/postgres.go | 1 + grid-proxy/internal/explorer/db/types.go | 1 + grid-proxy/pkg/types/indexer.go | 7 ++- grid-proxy/pkg/types/nodes.go | 2 + .../tests/queries/mock_client/loader.go | 8 ++- grid-proxy/tests/queries/mock_client/nodes.go | 24 ++++---- grid-proxy/tools/db/crafter/generator.go | 7 ++- grid-proxy/tools/db/schema.sql | 3 + 10 files changed, 91 insertions(+), 21 deletions(-) diff --git a/grid-proxy/internal/explorer/converters.go b/grid-proxy/internal/explorer/converters.go index 7f2c04f86..d21501dae 100644 --- a/grid-proxy/internal/explorer/converters.go +++ b/grid-proxy/internal/explorer/converters.go @@ -63,6 +63,7 @@ func nodeFromDBNode(info db.Node) types.Node { NumGPU: info.NumGPU, ExtraFee: info.ExtraFee, Healthy: info.Healthy, + UptimeScore: info.UptimeScore, Dmi: types.Dmi{ Processor: info.Processor, Memory: info.Memory, @@ -169,6 +170,7 @@ func nodeWithNestedCapacityFromDBNode(info db.Node) types.NodeWithNestedCapacity NumGPU: info.NumGPU, ExtraFee: info.ExtraFee, Healthy: info.Healthy, + UptimeScore: info.UptimeScore, Dmi: types.Dmi{ Processor: info.Processor, Memory: info.Memory, diff --git a/grid-proxy/internal/explorer/db/indexer_calls.go b/grid-proxy/internal/explorer/db/indexer_calls.go index 4561d58e1..fa4907989 100644 --- a/grid-proxy/internal/explorer/db/indexer_calls.go +++ b/grid-proxy/internal/explorer/db/indexer_calls.go @@ -38,9 +38,15 @@ func (p *PostgresDatabase) UpsertNodesGPU(ctx context.Context, gpus []types.Node } func (p *PostgresDatabase) UpsertNodeHealth(ctx context.Context, healthReports []types.HealthReport) error { + // Calculate uptime scores for each health 
report
+	for i := range healthReports {
+		uptimeScore := p.calculateUptimeScore(ctx, healthReports[i])
+		healthReports[i].UptimeScore = uptimeScore
+	}
+
 	conflictClause := clause.OnConflict{
 		Columns:   []clause.Column{{Name: "node_twin_id"}},
-		DoUpdates: clause.AssignmentColumns([]string{"healthy", "updated_at"}),
+		DoUpdates: clause.AssignmentColumns([]string{"healthy", "uptime_score", "updated_at"}),
 	}
 	return p.gormDB.WithContext(ctx).Table("health_report").Clauses(conflictClause).Create(&healthReports).Error
 }
@@ -111,3 +117,52 @@ func (p *PostgresDatabase) UpsertNodeLocation(ctx context.Context, locations []t
 	}
 	return p.gormDB.WithContext(ctx).Table("node_location").Clauses(conflictClause).Create(&locations).Error
 }
+
+// calculateUptimeScore returns the node's uptime score in [0, 1]: the share of
+// report intervals in a 30-day sliding window in which the node was healthy.
+// The window is advanced from the stored score: the new report is added and a
+// report found 30 days (±1 minute) earlier, if any, is dropped.
+func (p *PostgresDatabase) calculateUptimeScore(ctx context.Context, healthReport types.HealthReport) float64 {
+	const (
+		windowSeconds   = 30 * 24 * 60 * 60 // 30 days, same unit as UpdatedAt (Unix seconds)
+		windowIntervals = 30 * 24 * 60 / 5  // 30 days in minutes / 5; assumes a report every 5 minutes
+		toleranceSecs   = 60                // match window around the expiring report
+	)
+
+	newValue := 0.0
+	if healthReport.Healthy {
+		newValue = 1.0
+	}
+	var previousReport types.HealthReport
+	err := p.gormDB.WithContext(ctx).Table("health_report").
+		Where("node_twin_id = ?", healthReport.NodeTwinId).
+		Last(&previousReport).Error
+	if err != nil {
+		// No stored report could be read; start the window from this report alone.
+		return newValue / windowIntervals
+	}
+
+	// Value leaving the window: a report stored 30 days before this one.
+	// NOTE(review): node_twin_id is unique here, so only the current row can match — confirm.
+	expiredAt := healthReport.UpdatedAt - windowSeconds
+	oldValue := 0.0
+	var oldReport types.HealthReport
+	err = p.gormDB.WithContext(ctx).Table("health_report").
+		Where("node_twin_id = ? AND updated_at BETWEEN ? AND ?", healthReport.NodeTwinId, expiredAt-toleranceSecs, expiredAt+toleranceSecs).
+		Order("updated_at DESC").
+		First(&oldReport).Error
+	if err == nil && oldReport.Healthy {
+		oldValue = 1.0
+	}
+
+	// Clamp on both sides: when nothing expires, nothing is subtracted, so without
+	// the upper bound the score would keep growing past 1.0.
+	totalHealthyIntervals := previousReport.UptimeScore*windowIntervals + newValue - oldValue
+	if totalHealthyIntervals < 0 {
+		totalHealthyIntervals = 0
+	}
+	if totalHealthyIntervals > windowIntervals {
+		totalHealthyIntervals = windowIntervals
+	}
+	return totalHealthyIntervals / windowIntervals
+}
diff --git a/grid-proxy/internal/explorer/db/postgres.go b/grid-proxy/internal/explorer/db/postgres.go
index ee617a6dd..e9bbe05f9 100644
--- a/grid-proxy/internal/explorer/db/postgres.go
+++ b/grid-proxy/internal/explorer/db/postgres.go
@@ -376,6 +376,7 @@ func (d *PostgresDatabase) nodeTableQuery(ctx context.Context, filter types.Node
 		"resources_cache.node_gpu_count AS num_gpu",
 		"resources_cache.gpus",
 		"health_report.healthy",
+		"health_report.uptime_score",
 		"node_ipv6.has_ipv6",
 		"node_features.features as features",
 		"resources_cache.bios",
diff --git a/grid-proxy/internal/explorer/db/types.go b/grid-proxy/internal/explorer/db/types.go
index 6b2df569b..58d2876fc 100644
--- a/grid-proxy/internal/explorer/db/types.go
+++ b/grid-proxy/internal/explorer/db/types.go
@@ -106,6 +106,7 @@ type Node struct {
 	ExtraFee           uint64
 	NodeContractsCount uint64 `gorm:"node_contracts_count"`
 	Healthy            bool
+	UptimeScore        float64         `gorm:"uptime_score"`
 	Bios               types.BIOS      `gorm:"type:jsonb;serializer:json"`
 	Baseboard          types.Baseboard `gorm:"type:jsonb;serializer:json"`
 	Memory             []types.Memory  `gorm:"type:jsonb;serializer:json"`
diff --git a/grid-proxy/pkg/types/indexer.go b/grid-proxy/pkg/types/indexer.go
index f4a125a24..f169cb4a2 100644
--- a/grid-proxy/pkg/types/indexer.go
+++ b/grid-proxy/pkg/types/indexer.go
@@ -18,9 +18,10 @@ func (NodeGPU) TableName() string {
 // HealthReport holds the state of node healthiness
 // used as gorm model
 type HealthReport struct {
-	NodeTwinId uint32 `gorm:"unique;not null"`
-	Healthy    bool
-	UpdatedAt  int64
+	NodeTwinId  uint32  `gorm:"unique;not null"`
+	Healthy     bool
+	UptimeScore float64 `gorm:"default:0"` // Uptime score calculated using 30-day sliding window
+	UpdatedAt   int64
 }
func (HealthReport) TableName() string { diff --git a/grid-proxy/pkg/types/nodes.go b/grid-proxy/pkg/types/nodes.go index 2784bafc9..b280e4f01 100644 --- a/grid-proxy/pkg/types/nodes.go +++ b/grid-proxy/pkg/types/nodes.go @@ -48,6 +48,7 @@ type Node struct { NumGPU int `json:"num_gpu" sort:"num_gpu"` ExtraFee uint64 `json:"extraFee" sort:"extra_fee"` Healthy bool `json:"healthy"` + UptimeScore float64 `json:"uptime_score"` Dmi Dmi `json:"dmi"` Speed Speed `json:"speed"` CpuBenchmark CpuBenchmark `json:"cpu_benchmark"` @@ -94,6 +95,7 @@ type NodeWithNestedCapacity struct { NumGPU int `json:"num_gpu"` ExtraFee uint64 `json:"extraFee"` Healthy bool `json:"healthy"` + UptimeScore float64 `json:"uptime_score"` Dmi Dmi `json:"dmi"` Speed Speed `json:"speed"` CpuBenchmark CpuBenchmark `json:"cpu_benchmark"` diff --git a/grid-proxy/tests/queries/mock_client/loader.go b/grid-proxy/tests/queries/mock_client/loader.go index 04b1520f4..a59ab82e3 100644 --- a/grid-proxy/tests/queries/mock_client/loader.go +++ b/grid-proxy/tests/queries/mock_client/loader.go @@ -38,7 +38,7 @@ type DBData struct { GPUs map[uint32][]types.NodeGPU Regions map[string]string Locations map[string]Location - HealthReports map[uint32]bool + HealthReports map[uint32]types.HealthReport NodeIpv6 map[uint32]bool NodeFeatures map[uint32][]string DMIs map[uint32]types.Dmi @@ -514,6 +514,7 @@ func loadHealthReports(db *sql.DB, data *DBData) error { SELECT COALESCE(node_twin_id, 0), COALESCE(healthy, false), + COALESCE(uptime_score, 0.0), COALESCE(updated_at, 0) FROM health_report;`) @@ -525,11 +526,12 @@ func loadHealthReports(db *sql.DB, data *DBData) error { if err := rows.Scan( &health.NodeTwinId, &health.Healthy, + &health.UptimeScore, &health.UpdatedAt, ); err != nil { return err } - data.HealthReports[health.NodeTwinId] = health.Healthy + data.HealthReports[health.NodeTwinId] = health } return nil @@ -796,7 +798,7 @@ func Load(db *sql.DB, gormDB *gorm.DB) (DBData, error) { FarmHasRentedNode: 
make(map[uint64]map[uint64]bool), Regions: make(map[string]string), Locations: make(map[string]Location), - HealthReports: make(map[uint32]bool), + HealthReports: make(map[uint32]types.HealthReport), DMIs: make(map[uint32]types.Dmi), Speeds: make(map[uint32]types.Speed), CpuBenchmarks: make(map[uint32]types.CpuBenchmark), diff --git a/grid-proxy/tests/queries/mock_client/nodes.go b/grid-proxy/tests/queries/mock_client/nodes.go index 91ea3599c..dc38c650b 100644 --- a/grid-proxy/tests/queries/mock_client/nodes.go +++ b/grid-proxy/tests/queries/mock_client/nodes.go @@ -179,11 +179,12 @@ func (g *GridProxyMockClient) Nodes(ctx context.Context, filter types.NodeFilter State: node.Power.State, Target: node.Power.Target, }, - NumGPU: numGPU, - GPUs: getGpus(g.data, uint32(node.TwinID)), - ExtraFee: node.ExtraFee, - Healthy: g.data.HealthReports[uint32(node.TwinID)], - Dmi: g.data.DMIs[uint32(node.TwinID)], + NumGPU: numGPU, + GPUs: getGpus(g.data, uint32(node.TwinID)), + ExtraFee: node.ExtraFee, + Healthy: g.data.HealthReports[uint32(node.TwinID)].Healthy, + UptimeScore: g.data.HealthReports[uint32(node.TwinID)].UptimeScore, + Dmi: g.data.DMIs[uint32(node.TwinID)], Speed: types.Speed{ Upload: g.data.Speeds[uint32(node.TwinID)].Upload, Download: g.data.Speeds[uint32(node.TwinID)].Download, @@ -289,11 +290,12 @@ func (g *GridProxyMockClient) Node(ctx context.Context, nodeID uint32) (res type State: node.Power.State, Target: node.Power.Target, }, - NumGPU: numGPU, - GPUs: getGpus(g.data, uint32(node.TwinID)), - ExtraFee: node.ExtraFee, - Healthy: g.data.HealthReports[uint32(node.TwinID)], - Dmi: g.data.DMIs[uint32(node.TwinID)], + NumGPU: numGPU, + GPUs: getGpus(g.data, uint32(node.TwinID)), + ExtraFee: node.ExtraFee, + Healthy: g.data.HealthReports[uint32(node.TwinID)].Healthy, + UptimeScore: g.data.HealthReports[uint32(node.TwinID)].UptimeScore, + Dmi: g.data.DMIs[uint32(node.TwinID)], Speed: types.Speed{ Upload: g.data.Speeds[uint32(node.TwinID)].Upload, Download: 
g.data.Speeds[uint32(node.TwinID)].Download, @@ -354,7 +356,7 @@ func (n *Node) satisfies(f types.NodeFilter, data *DBData) bool { return false } - if f.Healthy != nil && *f.Healthy != data.HealthReports[uint32(n.TwinID)] { + if f.Healthy != nil && *f.Healthy != data.HealthReports[uint32(n.TwinID)].Healthy { return false } diff --git a/grid-proxy/tools/db/crafter/generator.go b/grid-proxy/tools/db/crafter/generator.go index 4df07c3ad..6925bf592 100644 --- a/grid-proxy/tools/db/crafter/generator.go +++ b/grid-proxy/tools/db/crafter/generator.go @@ -901,9 +901,10 @@ func (c *Crafter) GenerateHealthReports() error { } healthReport := types.HealthReport{ - NodeTwinId: uint32(nodeTwinsStart + i), - Healthy: health, - UpdatedAt: time.Now().Unix(), + NodeTwinId: uint32(nodeTwinsStart + i), + Healthy: health, + UptimeScore: rand.Float64(), // Random uptime score between 0.0 and 1.0 + UpdatedAt: time.Now().Unix(), } healthReports = append(healthReports, healthReport) } diff --git a/grid-proxy/tools/db/schema.sql b/grid-proxy/tools/db/schema.sql index de5d230a7..334cac9ca 100644 --- a/grid-proxy/tools/db/schema.sql +++ b/grid-proxy/tools/db/schema.sql @@ -1024,6 +1024,9 @@ CREATE TABLE IF NOT EXISTS public.health_report ( updated_at bigint ); +ALTER TABLE public.health_report + ADD COLUMN IF NOT EXISTS uptime_score double precision DEFAULT 0; + ALTER TABLE public.health_report OWNER TO postgres;