From 96dd3a3079edc4b6a26859ef6f0b30ec7caeedb5 Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Tue, 23 Sep 2025 10:46:12 +0200 Subject: [PATCH 1/3] refactor: Always return errors in JSON for CEEMS LB * Reuse the custom errors of CEEMS API server in CEEMS LB to return any errors in JSON. This will show the returned errors properly in Grafana helping users and operators to find issues. * Update e2e test fixtures Signed-off-by: Mahendra Paipuri --- pkg/api/http/docs/docs.go | 22 +++--- pkg/api/http/docs/swagger.json | 22 +++--- pkg/api/http/docs/swagger.yaml | 22 +++--- pkg/api/http/error.go | 73 ++++++++++--------- pkg/api/http/error_test.go | 2 +- pkg/api/http/middleware.go | 4 +- pkg/api/http/server.go | 47 ++++++------ ...st-api-server-global-stats-admin-query.txt | 2 +- .../output/e2e-test-api-verify-fail-query.txt | 2 +- pkg/lb/frontend/frontend.go | 12 ++- pkg/lb/frontend/middleware.go | 67 +++-------------- .../e2e-test-lb-forbid-user-query-api.txt | 4 +- .../e2e-test-lb-forbid-user-query-db.txt | 4 +- .../output/e2e-test-lb-server-pyro-only.txt | 2 +- .../e2e-test-lb-server-tsdb-pyro-mix.txt | 6 +- pkg/lb/testdata/output/e2e-test-lb-server.txt | 4 +- 16 files changed, 127 insertions(+), 168 deletions(-) diff --git a/pkg/api/http/docs/docs.go b/pkg/api/http/docs/docs.go index 25c810fe..77924424 100644 --- a/pkg/api/http/docs/docs.go +++ b/pkg/api/http/docs/docs.go @@ -1267,17 +1267,17 @@ const docTemplate = `{ "not_acceptable" ], "x-enum-varnames": [ - "errorNone", - "errorUnauthorized", - "errorForbidden", - "errorTimeout", - "errorCanceled", - "errorExec", - "errorBadData", - "errorInternal", - "errorUnavailable", - "errorNotFound", - "errorNotAcceptable" + "ErrorNone", + "ErrorUnauthorized", + "ErrorForbidden", + "ErrorTimeout", + "ErrorCanceled", + "ErrorExec", + "ErrorBadData", + "ErrorInternal", + "ErrorUnavailable", + "ErrorNotFound", + "ErrorNotAcceptable" ] }, "models.Cluster": { diff --git a/pkg/api/http/docs/swagger.json b/pkg/api/http/docs/swagger.json index 7893ae50..f0b64002 100644 --- a/pkg/api/http/docs/swagger.json +++ b/pkg/api/http/docs/swagger.json @@ -1264,17 +1264,17 @@ "not_acceptable" ], "x-enum-varnames": [ - "errorNone", - "errorUnauthorized", - "errorForbidden", - "errorTimeout", - "errorCanceled", - "errorExec", - "errorBadData", - "errorInternal", - "errorUnavailable", - "errorNotFound", - "errorNotAcceptable" + "ErrorNone", + "ErrorUnauthorized", + "ErrorForbidden", + "ErrorTimeout", + "ErrorCanceled", + "ErrorExec", + "ErrorBadData", + "ErrorInternal", + "ErrorUnavailable", + "ErrorNotFound", + "ErrorNotAcceptable" ] }, "models.Cluster": { diff --git a/pkg/api/http/docs/swagger.yaml b/pkg/api/http/docs/swagger.yaml index ac98fc2a..dcc58d3e 100644 --- a/pkg/api/http/docs/swagger.yaml +++ b/pkg/api/http/docs/swagger.yaml @@ -196,17 +196,17 @@ definitions: - not_acceptable type: string x-enum-varnames: - - errorNone - - errorUnauthorized - - errorForbidden - - errorTimeout - - errorCanceled - - errorExec - - errorBadData - - errorInternal - - errorUnavailable - - errorNotFound - - errorNotAcceptable + - ErrorNone + - ErrorUnauthorized + - ErrorForbidden + - ErrorTimeout + - ErrorCanceled + - ErrorExec + - ErrorBadData + - ErrorInternal + - ErrorUnavailable + - ErrorNotFound + - ErrorNotAcceptable models.Cluster: properties: id: diff --git a/pkg/api/http/error.go b/pkg/api/http/error.go index a3eddbd0..205b4a29 100644 --- a/pkg/api/http/error.go +++ b/pkg/api/http/error.go @@ -20,28 +20,28 @@ var ( type errorType string // Error response. -type apiError struct { - typ errorType - err error +type APIError struct { + Typ errorType + Err error } -func (e *apiError) Error() string { - return fmt.Sprintf("%s: %s", e.typ, e.err) +func (e *APIError) Error() string { + return fmt.Sprintf("%s: %s", e.Typ, e.Err) } // List of predefined errors. const ( - errorNone errorType = "" - errorUnauthorized errorType = "unauthorized" - errorForbidden errorType = "forbidden" - errorTimeout errorType = "timeout" - errorCanceled errorType = "canceled" - errorExec errorType = "execution" - errorBadData errorType = "bad_data" - errorInternal errorType = "internal" - errorUnavailable errorType = "unavailable" - errorNotFound errorType = "not_found" - errorNotAcceptable errorType = "not_acceptable" + ErrorNone errorType = "" + ErrorUnauthorized errorType = "unauthorized" + ErrorForbidden errorType = "forbidden" + ErrorTimeout errorType = "timeout" + ErrorCanceled errorType = "canceled" + ErrorExec errorType = "execution" + ErrorBadData errorType = "bad_data" + ErrorInternal errorType = "internal" + ErrorUnavailable errorType = "unavailable" + ErrorNotFound errorType = "not_found" + ErrorNotAcceptable errorType = "not_acceptable" ) // Custom error codes. @@ -53,36 +53,39 @@ const ( // Custom errors. var ( - errNoUser = errors.New("no user identified") - errNoPrivs = errors.New("current user does not have admin privileges") - errInvalidRequest = errors.New("invalid request") - errInvalidQueryField = errors.New("invalid query fields") - errMissingUUIDs = errors.New("uuids missing in the request") - errNoAuth = errors.New("user do not have permissions on uuids") + ErrNoUser = errors.New("no user identified") + ErrNoPrivs = errors.New("current user does not have admin privileges") + ErrInvalidRequest = errors.New("invalid request") + ErrInvalidQueryField = errors.New("invalid query fields") + ErrMissingData = errors.New("missing data in the request") + ErrNoAuth = errors.New("user do not have permissions to view metrics of this job/pod/vm") + ErrNoAccess = errors.New("user do not have permissions to access this resource") + ErrInvalidClusterID = errors.New("invalid ceems cluster id") + ErrUnavailable = errors.New("tsdb/pyroscope unavailable") ) // Return error response for by setting errorString and errorType in response. -func errorResponse[T any](w http.ResponseWriter, apiErr *apiError, logger *slog.Logger, data []T) { +func ErrorResponse[T any](w http.ResponseWriter, apiErr *APIError, logger *slog.Logger, data []T) { var code int - switch apiErr.typ { //nolint:exhaustive - case errorBadData: + switch apiErr.Typ { //nolint:exhaustive + case ErrorBadData, errorType(ErrNoUser.Error()), errorType(ErrInvalidRequest.Error()), errorType(ErrInvalidQueryField.Error()), errorType(ErrMissingData.Error()): code = http.StatusBadRequest - case errorUnauthorized: + case ErrorUnauthorized, errorType(ErrNoPrivs.Error()): code = http.StatusUnauthorized - case errorForbidden: + case ErrorForbidden, errorType(ErrNoAuth.Error()), errorType(ErrNoAccess.Error()): code = http.StatusForbidden - case errorExec: + case ErrorExec: code = http.StatusUnprocessableEntity - case errorCanceled: + case ErrorCanceled: code = statusClientClosedConnection - case errorTimeout: + case ErrorTimeout, ErrorUnavailable, errorType(ErrUnavailable.Error()): code = http.StatusServiceUnavailable - case errorInternal: + case ErrorInternal: code = http.StatusInternalServerError - case errorNotFound: + case ErrorNotFound: code = http.StatusNotFound - case errorNotAcceptable: + case ErrorNotAcceptable: code = http.StatusNotAcceptable default: code = http.StatusInternalServerError @@ -92,8 +95,8 @@ func errorResponse[T any](w http.ResponseWriter, apiErr *apiError, logger *slog. response := Response[T]{ Status: "error", - ErrorType: apiErr.typ, - Error: apiErr.err.Error(), + ErrorType: apiErr.Typ, + Error: apiErr.Err.Error(), Data: data, } diff --git a/pkg/api/http/error_test.go b/pkg/api/http/error_test.go index 327c09c1..c366af81 100644 --- a/pkg/api/http/error_test.go +++ b/pkg/api/http/error_test.go @@ -11,6 +11,6 @@ import ( ) func TestApiError(t *testing.T) { - e := apiError{typ: errorBadData, err: errors.New("bad data")} + e := APIError{Typ: ErrorBadData, Err: errors.New("bad data")} assert.Equal(t, "bad_data: bad data", e.Error()) } diff --git a/pkg/api/http/middleware.go b/pkg/api/http/middleware.go index 989f893a..bdab1847 100644 --- a/pkg/api/http/middleware.go +++ b/pkg/api/http/middleware.go @@ -90,7 +90,7 @@ func (amw *authenticationMiddleware) Middleware(next http.Handler) http.Handler amw.logger.Error("User Header not found. Denying authentication") // Write an error and stop the handler chain - errorResponse[any](w, &apiError{errorUnauthorized, errNoUser}, amw.logger, nil) + ErrorResponse[any](w, &APIError{ErrorUnauthorized, ErrNoUser}, amw.logger, nil) return } @@ -121,7 +121,7 @@ func (amw *authenticationMiddleware) Middleware(next http.Handler) http.Handler amw.logger.Error("Unprivileged user accessing admin resource", "logger_user", loggedUser, "url", r.URL) // Write an error and stop the handler chain - errorResponse[any](w, &apiError{errorForbidden, errNoPrivs}, amw.logger, nil) + ErrorResponse[any](w, &APIError{ErrorForbidden, ErrNoPrivs}, amw.logger, nil) return } diff --git a/pkg/api/http/server.go b/pkg/api/http/server.go index 07352872..f43903e9 100644 --- a/pkg/api/http/server.go +++ b/pkg/api/http/server.go @@ -724,8 +724,8 @@ func (s *CEEMSServer) unitsQuerier( // Get fields query parameters if any queriedFields := s.getQueriedFields(r.URL.Query(), base.UnitsDBTableColNames) if len(queriedFields) == 0 { - s.logger.Error("Invalid query fields", "logged_user", loggedUser, "err", errInvalidQueryField) - errorResponse[any](w, &apiError{errorBadData, errInvalidQueryField}, s.logger, nil) + s.logger.Error("Invalid query fields", "logged_user", loggedUser, "err", ErrInvalidQueryField) + ErrorResponse[any](w, &APIError{ErrorBadData, ErrInvalidQueryField}, s.logger, nil) return } @@ -768,7 +768,7 @@ func (s *CEEMSServer) unitsQuerier( // Get query window time stamps timeQuery, err = s.getQueryWindow(r, "ended_at", running, false) if err != nil { - errorResponse[any](w, &apiError{errorBadData, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorBadData, err}, s.logger, nil) return } @@ -785,7 +785,7 @@ queryUnits: units, err := s.queriers.unit(r.Context(), s.db, q, s.logger) if units == nil && err != nil { s.logger.Error("Failed to fetch units", "logged_user", loggedUser, "err", err) - errorResponse[any](w, &apiError{errorInternal, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorInternal, err}, s.logger, nil) return } @@ -977,7 +977,8 @@ func (s *CEEMSServer) verifyUnitsOwnership(w http.ResponseWriter, r *http.Reques // Get list of queried uuids uuids := r.URL.Query()["uuid"] if len(uuids) == 0 { - errorResponse[any](w, &apiError{errorBadData, errMissingUUIDs}, s.logger, nil) + s.logger.Error("uuids not found in query parameters") + ErrorResponse[any](w, &APIError{ErrorBadData, ErrMissingData}, s.logger, nil) return } @@ -996,7 +997,7 @@ func (s *CEEMSServer) verifyUnitsOwnership(w http.ResponseWriter, r *http.Reques w.Write([]byte("KO")) } } else { - errorResponse[any](w, &apiError{errorForbidden, errNoAuth}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorForbidden, ErrNoAuth}, s.logger, nil) } } @@ -1045,7 +1046,7 @@ func (s *CEEMSServer) clustersAdmin(w http.ResponseWriter, r *http.Request) { clusterIDs, err := s.queriers.cluster(r.Context(), s.db, q, s.logger) if clusterIDs == nil && err != nil { s.logger.Error("Failed to fetch cluster IDs", "user", loggedUser, "err", err) - errorResponse[any](w, &apiError{errorInternal, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorInternal, err}, s.logger, nil) return } @@ -1081,7 +1082,7 @@ func (s *CEEMSServer) adminUsersQuerier(w http.ResponseWriter, r *http.Request) adminUsersLists, err := s.queriers.adminUser(r.Context(), s.db, q, s.logger) if adminUsersLists == nil && err != nil { s.logger.Error("Failed to fetch admin user details", "err", err) - errorResponse[any](w, &apiError{errorInternal, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorInternal, err}, s.logger, nil) return } @@ -1134,7 +1135,7 @@ func (s *CEEMSServer) usersQuerier(users []string, w http.ResponseWriter, r *htt userModels, err := s.queriers.user(r.Context(), s.db, q, s.logger) if userModels == nil && err != nil { s.logger.Error("Failed to fetch user details", "users", strings.Join(users, ","), "err", err) - errorResponse[any](w, &apiError{errorInternal, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorInternal, err}, s.logger, nil) return } @@ -1281,7 +1282,7 @@ func (s *CEEMSServer) projectsQuerier(users []string, w http.ResponseWriter, r * "Failed to fetch project details", "users", strings.Join(users, ","), "err", err, ) - errorResponse[any](w, &apiError{errorInternal, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorInternal, err}, s.logger, nil) return } @@ -1451,7 +1452,7 @@ func (s *CEEMSServer) currentUsage(users []string, fields []string, w http.Respo // Round `to` and `from` query parameters to cacheTTL err = s.roundQueryWindow(r) if err != nil { - errorResponse[any](w, &apiError{errorBadData, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorBadData, err}, s.logger, nil) return } @@ -1463,7 +1464,7 @@ func (s *CEEMSServer) currentUsage(users []string, fields []string, w http.Respo // Get query window time stamps timeQuery, err = s.getQueryWindow(r, "last_updated_at", false, terminated) if err != nil { - errorResponse[any](w, &apiError{errorBadData, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorBadData, err}, s.logger, nil) return } @@ -1601,7 +1602,7 @@ func (s *CEEMSServer) currentUsage(users []string, fields []string, w http.Respo usage, err = s.queriers.usage(r.Context(), s.db, q, s.logger) if usage == nil && err != nil { s.logger.Error("Failed to fetch current usage statistics", "users", strings.Join(users, ","), "err", err) - errorResponse[any](w, &apiError{errorInternal, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorInternal, err}, s.logger, nil) return } @@ -1658,7 +1659,7 @@ func (s *CEEMSServer) globalUsage(users []string, queriedFields []string, w http usage, err := s.queriers.usage(r.Context(), s.db, q, s.logger) if usage == nil && err != nil { s.logger.Error("Failed to fetch global usage statistics", "users", strings.Join(users, ","), "err", err) - errorResponse[any](w, &apiError{errorInternal, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorInternal, err}, s.logger, nil) return } @@ -1754,7 +1755,7 @@ func (s *CEEMSServer) usage(w http.ResponseWriter, r *http.Request) { var exists bool if mode, exists = mux.Vars(r)["mode"]; !exists { - errorResponse[any](w, &apiError{errorBadData, errInvalidRequest}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorBadData, ErrInvalidRequest}, s.logger, nil) return } @@ -1763,7 +1764,7 @@ func (s *CEEMSServer) usage(w http.ResponseWriter, r *http.Request) { queriedFields := s.getQueriedFields(r.URL.Query(), base.UsageDBTableColNames) if len(queriedFields) == 0 { s.logger.Error("Invalid query fields", "logged_user", loggedUser) - errorResponse[any](w, &apiError{errorBadData, errInvalidQueryField}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorBadData, ErrInvalidQueryField}, s.logger, nil) return } @@ -1857,7 +1858,7 @@ func (s *CEEMSServer) usageAdmin(w http.ResponseWriter, r *http.Request) { var exists bool if mode, exists = mux.Vars(r)["mode"]; !exists { - errorResponse[any](w, &apiError{errorBadData, errInvalidRequest}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorBadData, ErrInvalidRequest}, s.logger, nil) return } @@ -1866,7 +1867,7 @@ func (s *CEEMSServer) usageAdmin(w http.ResponseWriter, r *http.Request) { queriedFields := s.getQueriedFields(r.URL.Query(), base.UsageDBTableColNames) if len(queriedFields) == 0 { s.logger.Error("Invalid query fields", "logged_user", loggedUser) - errorResponse[any](w, &apiError{errorBadData, errInvalidQueryField}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorBadData, ErrInvalidQueryField}, s.logger, nil) return } @@ -1903,7 +1904,7 @@ func (s *CEEMSServer) currentStats(users []string, w http.ResponseWriter, r *htt // Get query window time stamps timeQuery, err = s.getQueryWindow(r, "ended_at", true, false) if err != nil { - errorResponse[any](w, &apiError{errorBadData, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorBadData, err}, s.logger, nil) return } @@ -1928,7 +1929,7 @@ func (s *CEEMSServer) currentStats(users []string, w http.ResponseWriter, r *htt stats, err = s.queriers.stat(r.Context(), s.db, q, s.logger) if stats == nil && err != nil { s.logger.Error("Failed to fetch current quick stats", "users", strings.Join(users, ","), "err", err) - errorResponse[any](w, &apiError{errorInternal, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorInternal, err}, s.logger, nil) return } @@ -1983,7 +1984,7 @@ func (s *CEEMSServer) globalStats(users []string, w http.ResponseWriter, r *http stats, err = s.queriers.stat(r.Context(), s.db, q, s.logger) if stats == nil && err != nil { s.logger.Error("Failed to fetch global quick stats", "users", strings.Join(users, ","), "err", err) - errorResponse[any](w, &apiError{errorInternal, err}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorInternal, err}, s.logger, nil) return } @@ -2058,7 +2059,7 @@ func (s *CEEMSServer) statsAdmin(w http.ResponseWriter, r *http.Request) { var exists bool if mode, exists = mux.Vars(r)["mode"]; !exists { - errorResponse[any](w, &apiError{errorBadData, errInvalidRequest}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorBadData, ErrInvalidRequest}, s.logger, nil) return } @@ -2113,7 +2114,7 @@ func (s *CEEMSServer) demo(w http.ResponseWriter, r *http.Request) { var exists bool if resourceType, exists = mux.Vars(r)["resource"]; !exists { - errorResponse[any](w, &apiError{errorBadData, errInvalidRequest}, s.logger, nil) + ErrorResponse[any](w, &APIError{ErrorBadData, ErrInvalidRequest}, s.logger, nil) return } diff --git a/pkg/api/testdata/output/e2e-test-api-server-global-stats-admin-query.txt b/pkg/api/testdata/output/e2e-test-api-server-global-stats-admin-query.txt index 2990fd8d..c903dcec 100644 --- a/pkg/api/testdata/output/e2e-test-api-server-global-stats-admin-query.txt +++ b/pkg/api/testdata/output/e2e-test-api-server-global-stats-admin-query.txt @@ -1 +1 @@ -{"status":"success","data":[{"cluster_id":"k8s-0","resource_manager":"k8s","num_units":4,"num_inactive_units":1,"num_active_units":3,"num_projects":3,"num_users":4},{"cluster_id":"k8s-1","resource_manager":"k8s","num_units":4,"num_inactive_units":1,"num_active_units":3,"num_projects":3,"num_users":4},{"cluster_id":"os-0","resource_manager":"openstack","num_units":18,"num_inactive_units":6,"num_active_units":12,"num_projects":5,"num_users":5},{"cluster_id":"os-1","resource_manager":"openstack","num_units":18,"num_inactive_units":6,"num_active_units":12,"num_projects":5,"num_users":5},{"cluster_id":"slurm-0","resource_manager":"slurm","num_units":12,"num_inactive_units":10,"num_active_units":2,"num_projects":5,"num_users":7},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":12,"num_inactive_units":10,"num_active_units":2,"num_projects":5,"num_users":7}]} +{"status":"success","data":[{"cluster_id":"k8s-0","resource_manager":"k8s","num_units":3,"num_inactive_units":1,"num_active_units":2,"num_projects":2,"num_users":3},{"cluster_id":"k8s-1","resource_manager":"k8s","num_units":3,"num_inactive_units":1,"num_active_units":2,"num_projects":2,"num_users":3},{"cluster_id":"os-0","resource_manager":"openstack","num_units":18,"num_inactive_units":6,"num_active_units":12,"num_projects":5,"num_users":5},{"cluster_id":"os-1","resource_manager":"openstack","num_units":18,"num_inactive_units":6,"num_active_units":12,"num_projects":5,"num_users":5},{"cluster_id":"slurm-0","resource_manager":"slurm","num_units":12,"num_inactive_units":10,"num_active_units":2,"num_projects":5,"num_users":7},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":12,"num_inactive_units":10,"num_active_units":2,"num_projects":5,"num_users":7}]} diff --git a/pkg/api/testdata/output/e2e-test-api-verify-fail-query.txt b/pkg/api/testdata/output/e2e-test-api-verify-fail-query.txt index 01099469..83add0d2 100644 --- a/pkg/api/testdata/output/e2e-test-api-verify-fail-query.txt +++ b/pkg/api/testdata/output/e2e-test-api-verify-fail-query.txt @@ -1 +1 @@ -{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions on uuids"} +{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to view metrics of this job/pod/vm"} diff --git a/pkg/lb/frontend/frontend.go b/pkg/lb/frontend/frontend.go index 1ee5b5b0..c490a274 100644 --- a/pkg/lb/frontend/frontend.go +++ b/pkg/lb/frontend/frontend.go @@ -16,6 +16,7 @@ import ( ceems_api_base "github.com/ceems-dev/ceems/pkg/api/base" ceems_api_cli "github.com/ceems-dev/ceems/pkg/api/cli" + ceems_api_http "github.com/ceems-dev/ceems/pkg/api/http" "github.com/ceems-dev/ceems/pkg/api/models" "github.com/ceems-dev/ceems/pkg/lb/base" "github.com/ceems-dev/ceems/pkg/lb/serverpool" @@ -165,7 +166,8 @@ func (lb *loadBalancer) Serve(w http.ResponseWriter, r *http.Request) { // Check if queryParams is nil which could happen in edge cases if queryParams == nil { - http.Error(w, "Query parameters not found", http.StatusBadRequest) + lb.logger.Error("Query parameters not found", "err", ceems_api_http.ErrMissingData) + ceems_api_http.ErrorResponse[any](w, &ceems_api_http.APIError{Typ: ceems_api_http.ErrorBadData, Err: ceems_api_http.ErrMissingData}, lb.logger, nil) return } @@ -176,7 +178,8 @@ func (lb *loadBalancer) Serve(w http.ResponseWriter, r *http.Request) { if v, ok := queryParams.(*ReqParams); ok { id = v.clusterID } else { - http.Error(w, "Invalid query parameters", http.StatusBadRequest) + lb.logger.Error("Invalid query parameters", "err", ceems_api_http.ErrInvalidRequest) + ceems_api_http.ErrorResponse[any](w, &ceems_api_http.APIError{Typ: ceems_api_http.ErrorBadData, Err: ceems_api_http.ErrInvalidRequest}, lb.logger, nil) return } @@ -188,7 +191,8 @@ func (lb *loadBalancer) Serve(w http.ResponseWriter, r *http.Request) { return } - http.Error(w, "Service not available", http.StatusServiceUnavailable) + lb.logger.Error("Service not available", "err", ceems_api_http.ErrUnavailable) + ceems_api_http.ErrorResponse[any](w, &ceems_api_http.APIError{Typ: ceems_api_http.ErrorUnavailable, Err: ceems_api_http.ErrUnavailable}, lb.logger, nil) } // errorHandlers sets up error handlers for backend servers. @@ -203,7 +207,7 @@ func (lb *loadBalancer) errorHandlers() { // If already retried the request, return error if !allowRetry(request) { lb.logger.Info("Max retry attempts reached, terminating", "address", request.RemoteAddr, "path", request.URL.Path) - http.Error(writer, "Service not available", http.StatusServiceUnavailable) + ceems_api_http.ErrorResponse[any](writer, &ceems_api_http.APIError{Typ: ceems_api_http.ErrorUnavailable, Err: ceems_api_http.ErrUnavailable}, lb.logger, nil) return } diff --git a/pkg/lb/frontend/middleware.go b/pkg/lb/frontend/middleware.go index f46786f0..1e902faf 100644 --- a/pkg/lb/frontend/middleware.go +++ b/pkg/lb/frontend/middleware.go @@ -6,7 +6,6 @@ package frontend import ( "context" "database/sql" - "encoding/json" "errors" "fmt" "log/slog" @@ -20,7 +19,7 @@ import ( "time" ceems_api_base "github.com/ceems-dev/ceems/pkg/api/base" - ceems_api "github.com/ceems-dev/ceems/pkg/api/http" + ceems_api_http "github.com/ceems-dev/ceems/pkg/api/http" "github.com/ceems-dev/ceems/pkg/api/models" "github.com/ceems-dev/ceems/pkg/lb/base" "github.com/prometheus/common/config" @@ -121,7 +120,7 @@ func (c *ceems) adminUsers(ctx context.Context) ([]string, error) { // Check if DB is available if c.db != nil { - adminUsers, err = ceems_api.AdminUserNames(ctx, c.db) + adminUsers, err = ceems_api_http.AdminUserNames(ctx, c.db) if err != nil { return nil, err } @@ -254,22 +253,10 @@ func (amw *authenticationMiddleware) Middleware(next http.Handler) http.Handler // Verify clusterID is in list of valid cluster IDs if !slices.Contains(amw.clusterIDs, reqParams.clusterID) { - amw.logger.Error("ClusterID header not found. Bad request", "url", r.URL) + amw.logger.Error("Invalid cluster ID", "found_id", reqParams.clusterID, "valid_ids", strings.Join(amw.clusterIDs, ","), "url", r.URL) // Write an error and stop the handler chain - w.WriteHeader(http.StatusBadRequest) - - response := ceems_api.Response[any]{ - Status: "error", - ErrorType: "bad_request", - Error: "invalid cluster ID. Set cluster ID using X-Ceems-Cluster-Id header in Prometheus datasource.", - } - - err := json.NewEncoder(w).Encode(&response) - if err != nil { - amw.logger.Error("Failed to encode response", "err", err) - w.Write([]byte("KO")) - } + ceems_api_http.ErrorResponse[any](w, &ceems_api_http.APIError{Typ: ceems_api_http.ErrorBadData, Err: ceems_api_http.ErrInvalidClusterID}, amw.logger, nil) return } @@ -290,22 +277,10 @@ func (amw *authenticationMiddleware) Middleware(next http.Handler) http.Handler // Check if username header is available loggedUser = r.Header.Get(ceems_api_base.GrafanaUserHeader) if loggedUser == "" { - amw.logger.Error("Grafana user Header not found. Denying authentication", "url", r.URL) + amw.logger.Error("User Header not found. Denying authentication", "url", r.URL) // Write an error and stop the handler chain - w.WriteHeader(http.StatusUnauthorized) - - response := ceems_api.Response[any]{ - Status: "error", - ErrorType: "unauthorized", - Error: "no user header found. Make sure to set send_user_header = true in [dataproxy] section of Grafana configuration file.", - } - - err := json.NewEncoder(w).Encode(&response) - if err != nil { - amw.logger.Error("Failed to encode response", "err", err) - w.Write([]byte("KO")) - } + ceems_api_http.ErrorResponse[any](w, &ceems_api_http.APIError{Typ: ceems_api_http.ErrorUnauthorized, Err: ceems_api_http.ErrNoUser}, amw.logger, nil) return } @@ -324,19 +299,7 @@ func (amw *authenticationMiddleware) Middleware(next http.Handler) http.Handler amw.logger.Error("Forbidden resource", "logged_user", loggedUser, "resource", r.URL.Path) // Write an error and stop the handler chain - w.WriteHeader(http.StatusForbidden) - - response := ceems_api.Response[any]{ - Status: "error", - ErrorType: "forbidden", - Error: "user do not have permissions to this resource", - } - - err := json.NewEncoder(w).Encode(&response) - if err != nil { - amw.logger.Error("Failed to encode response", "err", err) - w.Write([]byte("KO")) - } + ceems_api_http.ErrorResponse[any](w, &ceems_api_http.APIError{Typ: ceems_api_http.ErrorForbidden, Err: ceems_api_http.ErrNoAccess}, amw.logger, nil) return } @@ -363,19 +326,7 @@ func (amw *authenticationMiddleware) Middleware(next http.Handler) http.Handler reqParams.uuids, ) { // Write an error and stop the handler chain - w.WriteHeader(http.StatusForbidden) - - response := ceems_api.Response[any]{ - Status: "error", - ErrorType: "forbidden", - Error: "user do not have permissions to view unit metrics", - } - - err := json.NewEncoder(w).Encode(&response) - if err != nil { - amw.logger.Error("Failed to encode response", "err", err) - w.Write([]byte("KO")) - } + ceems_api_http.ErrorResponse[any](w, &ceems_api_http.APIError{Typ: ceems_api_http.ErrorForbidden, Err: ceems_api_http.ErrNoAuth}, amw.logger, nil) return } @@ -412,7 +363,7 @@ func (amw *authenticationMiddleware) isUserUnit( // Always prefer checking with DB connection directly if it is available // As DB query is way more faster than HTTP API request if amw.ceems.db != nil { - return ceems_api.VerifyOwnership(ctx, user, clusterIDs, uuids, amw.ceems.db, amw.logger) + return ceems_api_http.VerifyOwnership(ctx, user, clusterIDs, uuids, amw.ceems.db, amw.logger) } // If CEEMS URL is available make a API request diff --git a/pkg/lb/testdata/output/e2e-test-lb-forbid-user-query-api.txt b/pkg/lb/testdata/output/e2e-test-lb-forbid-user-query-api.txt index 4034d69c..c714bee3 100644 --- a/pkg/lb/testdata/output/e2e-test-lb-forbid-user-query-api.txt +++ b/pkg/lb/testdata/output/e2e-test-lb-forbid-user-query-api.txt @@ -1,3 +1,3 @@ -{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to view unit metrics"} -{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to view unit metrics"} +{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to view metrics of this job/pod/vm"} +{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to view metrics of this job/pod/vm"} diff --git a/pkg/lb/testdata/output/e2e-test-lb-forbid-user-query-db.txt b/pkg/lb/testdata/output/e2e-test-lb-forbid-user-query-db.txt index 4034d69c..c714bee3 100644 --- a/pkg/lb/testdata/output/e2e-test-lb-forbid-user-query-db.txt +++ b/pkg/lb/testdata/output/e2e-test-lb-forbid-user-query-db.txt @@ -1,3 +1,3 @@ -{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to view unit metrics"} -{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to view unit metrics"} +{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to view metrics of this job/pod/vm"} +{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to view metrics of this job/pod/vm"} diff --git a/pkg/lb/testdata/output/e2e-test-lb-server-pyro-only.txt b/pkg/lb/testdata/output/e2e-test-lb-server-pyro-only.txt index a97f0431..c6818286 100644 --- a/pkg/lb/testdata/output/e2e-test-lb-server-pyro-only.txt +++ b/pkg/lb/testdata/output/e2e-test-lb-server-pyro-only.txt @@ -1 +1 @@ -{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to this resource"} +{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to access this resource"} diff --git a/pkg/lb/testdata/output/e2e-test-lb-server-tsdb-pyro-mix.txt b/pkg/lb/testdata/output/e2e-test-lb-server-tsdb-pyro-mix.txt index dd626632..2e585b0f 100644 --- a/pkg/lb/testdata/output/e2e-test-lb-server-tsdb-pyro-mix.txt +++ b/pkg/lb/testdata/output/e2e-test-lb-server-tsdb-pyro-mix.txt @@ -1,3 +1,3 @@ -{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to this resource"} -{"status":"error","data":null,"errorType":"bad_request","error":"invalid cluster ID. Set cluster ID using X-Ceems-Cluster-Id header in Prometheus datasource."} -{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to this resource"} +{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to access this resource"} +{"status":"error","data":null,"errorType":"bad_data","error":"invalid ceems cluster id"} +{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to access this resource"} diff --git a/pkg/lb/testdata/output/e2e-test-lb-server.txt b/pkg/lb/testdata/output/e2e-test-lb-server.txt index 41ed055e..0950fb43 100644 --- a/pkg/lb/testdata/output/e2e-test-lb-server.txt +++ b/pkg/lb/testdata/output/e2e-test-lb-server.txt @@ -1,2 +1,2 @@ -{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to this resource"} -{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to this resource"} +{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to access this resource"} +{"status":"error","data":null,"errorType":"forbidden","error":"user do not have permissions to access this resource"} From 7eaa2856a11a4f0edd62d2d96724b962b4c4fbe3 Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Tue, 23 Sep 2025 10:48:37 +0200 Subject: [PATCH 2/3] fix: Ignore k8s pods that have not started * As we insert DB entries based on UUID and start time, pods that have not started will have N/A as start time and eventually when they start, they will have a correct start time. This will result in duplicate DB entries. To avoid this, insert into DB only after pods started. Signed-off-by: Mahendra Paipuri --- pkg/api/resource/k8s/manager.go | 24 ++++++++---- pkg/api/resource/k8s/manager_test.go | 56 ++++++++++++++-------------- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/pkg/api/resource/k8s/manager.go b/pkg/api/resource/k8s/manager.go index e1f9de52..2c554828 100644 --- a/pkg/api/resource/k8s/manager.go +++ b/pkg/api/resource/k8s/manager.go @@ -191,7 +191,9 @@ func (k *k8sManager) fetchPods( // Transform pods into units units := make([]models.Unit, len(pods)) - for ipod, pod := range pods { + ipod := 0 + + for _, pod := range pods { // Convert CreatedAt to current time location createdAt := pod.CreationTimestamp.In(loc) @@ -288,17 +290,22 @@ func (k *k8sManager) fetchPods( elapsedTime = notAvailable } + // If startedAt is zero, ignore that pod. As we insert data into + // DB based on UUID and startedAt fields, including a compute unit + // with "N/A" start time and "real" start time (eventually when it starts) + // will create two entries for the same compute unit. We do not want that + // and so ignore if the pod hasnt started yet. + if startedAt.IsZero() { + continue + } + // Check if startedAt and endedAt are valid var startedAtString, endedAtString string var startedAtTS, endedAtTS int64 - if startedAt.IsZero() { - startedAtString = notAvailable - } else { - startedAtString = startedAt.Format(base.DatetimezoneLayout) - startedAtTS = startedAt.UnixMilli() - } + startedAtString = startedAt.Format(base.DatetimezoneLayout) + startedAtTS = startedAt.UnixMilli() if endedAt.IsZero() { endedAtString = notAvailable @@ -418,11 +425,12 @@ func (k *k8sManager) fetchPods( Allocation: allocation, Tags: tags, } + ipod++ } k.logger.Info("k8s pods fetched", "cluster_id", k.cluster.ID, "start", start, "end", end, "num_pods", len(units)) - return units + return units[:ipod] } func (k *k8sManager) fetchUserNSs(ctx context.Context, current time.Time) ([]models.User, []models.Project) { diff --git a/pkg/api/resource/k8s/manager_test.go b/pkg/api/resource/k8s/manager_test.go index 08e91853..58777fc0 100644 --- a/pkg/api/resource/k8s/manager_test.go +++ b/pkg/api/resource/k8s/manager_test.go @@ -25,32 +25,32 @@ var ( noOpLogger = slog.New(slog.DiscardHandler) expectedUnits = map[string]models.Unit{ - "3a61e77f-1538-476b-8231-5af9eed40fdc": { - ClusterID: "k8s-0", - ResourceManager: "k8s", - UUID: "3a61e77f-1538-476b-8231-5af9eed40fdc", - Name: "pod31", - Project: "ns3", - Group: "", - User: "kusr3", - CreatedAt: "2025-07-07T11:16:56+0200", - StartedAt: "N/A", - EndedAt: "N/A", - CreatedAtTS: 1751879816000, - StartedAtTS: 0, - EndedAtTS: 0, - Elapsed: "N/A", - State: "Pending", - Allocation: models.Generic{ - "mem": 3.221225472e+09, "nvidia.com/gpu": 1.0, "nvidia.com/mig-4g.20gb": 2.0, "vcpus": 3.0, - }, - TotalTime: models.MetricMap{ - "alloc_cpumemtime": 0, "alloc_cputime": 0, "alloc_gpumemtime": 0, "alloc_gputime": 0, "walltime": 0, - }, - Tags: models.Generic{ - "annotations": map[string]string{"ceems.io/created-by": "kusr3"}, "qos": "Burstable", - }, - }, + // "3a61e77f-1538-476b-8231-5af9eed40fdc": { + // ClusterID: "k8s-0", + // ResourceManager: "k8s", + // UUID: "3a61e77f-1538-476b-8231-5af9eed40fdc", + // Name: "pod31", + // Project: "ns3", + // Group: "", + // User: "kusr3", + // CreatedAt: "2025-07-07T11:16:56+0200", + // StartedAt: "N/A", + // EndedAt: "N/A", + // CreatedAtTS: 1751879816000, + // StartedAtTS: 0, + // EndedAtTS: 0, + // Elapsed: "N/A", + // State: "Pending", + // Allocation: models.Generic{ + // "mem": 3.221225472e+09, "nvidia.com/gpu": 1.0, "nvidia.com/mig-4g.20gb": 2.0, "vcpus": 3.0, + // }, + // TotalTime: models.MetricMap{ + // "alloc_cpumemtime": 0, "alloc_cputime": 0, "alloc_gpumemtime": 0, "alloc_gputime": 0, "walltime": 0, + // }, + // Tags: models.Generic{ + // "annotations": map[string]string{"ceems.io/created-by": "kusr3"}, "qos": "Burstable", + // }, + // }, "6c22124f-e9a7-450b-8915-9bf3e0716d78": { ClusterID: "k8s-0", ResourceManager: "k8s", @@ -136,14 +136,14 @@ var ( {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "rb2", Projects: models.List{"ns2"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "kusr1", Projects: models.List{"ns1"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "kusr2", Projects: models.List{"ns2"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, - {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "kusr3", Projects: models.List{"ns3"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, + // {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "kusr3", Projects: models.List{"ns3"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "file1", Projects: models.List{"ns1"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "file2", Projects: models.List{"ns1"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "file3", Projects: models.List{"ns3"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "ns2:" + base.ServiceAccountUser, Projects: models.List{"ns2"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, } expectedProjects = []models.Project{ - {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "ns3", Users: models.List{"file3", "kusr3", "rb3"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, + {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "ns3", Users: models.List{"file3", "rb3"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "ns1", Users: models.List{"file1", "file2", "kusr1", "rb1"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, {ClusterID: "k8s-0", ResourceManager: "k8s", Name: "ns2", Users: models.List{"kusr2", "ns2:" + base.ServiceAccountUser, "rb1", "rb2"}, LastUpdatedAt: "2025-07-07T12:15:00+0200"}, } From 18f3b54fdcc58c23b1032e5c149725bc54ba2cb5 Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Tue, 23 Sep 2025 10:49:19 +0200 Subject: [PATCH 3/3] docs: Add Helm section in installation docs * Add dependabot config to update GH actions as well Signed-off-by: Mahendra Paipuri --- .github/dependabot.yml | 9 +++++++ website/docs/installation/ansible.md | 2 +- website/docs/installation/build.md | 2 +- website/docs/installation/containers.md | 2 +- website/docs/installation/helm.md | 26 +++++++++++++++++++ website/docs/usage/ceems-exporter.md | 1 - website/docusaurus.config.ts | 4 +++ .../src/components/HomepageFeatures/index.tsx | 2 +- 8 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 website/docs/installation/helm.md diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 3d9898df..897640bd 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -22,3 +22,12 @@ updates: npm-dependencies: patterns: - "*" + + - package-ecosystem: "github-actions" # See documentation for possible values + directory: "/" # Workflow files stored in the default location of `.github/workflows` + schedule: + interval: "weekly" + groups: + actions-dependencies: + patterns: + - "*" diff --git a/website/docs/installation/ansible.md b/website/docs/installation/ansible.md index bd62e48f..3e1b9980 100644 --- a/website/docs/installation/ansible.md +++ b/website/docs/installation/ansible.md @@ -1,5 +1,5 @@ --- -sidebar_position: 5 +sidebar_position: 6 --- # Ansible diff --git a/website/docs/installation/build.md b/website/docs/installation/build.md index e8c35e03..a1cccea7 100644 --- a/website/docs/installation/build.md +++ b/website/docs/installation/build.md @@ -1,5 +1,5 @@ --- -sidebar_position: 6 +sidebar_position: 7 --- # Build from Source diff --git a/website/docs/installation/containers.md b/website/docs/installation/containers.md index b25a008f..a6e250bf 100644 --- a/website/docs/installation/containers.md +++ b/website/docs/installation/containers.md @@ -1,5 +1,5 @@ --- -sidebar_position: 4 +sidebar_position: 5 --- # Containers diff --git a/website/docs/installation/helm.md b/website/docs/installation/helm.md new file mode 100644 index 00000000..e740c628 --- /dev/null +++ b/website/docs/installation/helm.md @@ -0,0 +1,26 @@ +--- +sidebar_position: 4 +--- + +# Helm + +For k8s deployments, CEEMS can be installed using [Helm](https://helm.sh/). +The charts are available at [Helm repository](https://@ceemsOrg@.github.io/helm-charts). + +The helm repository can be added using the following command: + +```bash +helm repo add @ceemsOrg@ https://@ceemsOrg@.github.io/helm-charts +``` + +Once the repository has been successfully added, it can be installed using: + +```bash +helm install -n ceems --create-namespace ceems ceems-dev/kube-ceems +``` + +This will create a new namespace `ceems` and install all the CEEMS components along +with Prometheus and Grafana. + +More instructions on how to use chart values can be found in the chart's +[README](https://github.com/@ceemsOrg@/helm-charts/blob/main/charts/kube-ceems/README.md) diff --git a/website/docs/usage/ceems-exporter.md b/website/docs/usage/ceems-exporter.md index 6fe00bbb..76016144 100644 --- a/website/docs/usage/ceems-exporter.md +++ b/website/docs/usage/ceems-exporter.md @@ -16,7 +16,6 @@ The following collectors are enabled by default: - `cpu`: Node-level CPU statistics - `memory`: Node-level memory statistics -- `rapl`: RAPL energy counters By default, the CEEMS exporter exposes metrics on all interfaces, port `9010`, and at the `/metrics` endpoint. This can be changed by setting the `--web.listen-address` CLI flag: diff --git a/website/docusaurus.config.ts b/website/docusaurus.config.ts index 06e35369..a6e1b63a 100644 --- a/website/docusaurus.config.ts +++ b/website/docusaurus.config.ts @@ -247,6 +247,10 @@ const config: Config = { label: "GitHub", href: `https://github.com/${organizationName}/${projectName}`, }, + { + label: "Helm Charts", + href: `https://github.com/${organizationName}/helm-charts`, + }, ], }, ], diff --git a/website/src/components/HomepageFeatures/index.tsx b/website/src/components/HomepageFeatures/index.tsx index 18a88b2f..14dfa4ac 100644 --- a/website/src/components/HomepageFeatures/index.tsx +++ b/website/src/components/HomepageFeatures/index.tsx @@ -16,7 +16,7 @@ const FeatureList: FeatureItem[] = [ <> CEEMS was designed to be resource manager agnostic. Although thoeritically it can support many resource managers, we are focusing to - support SLURM, Openstack and Kubernetes deployed on baremetal. + support SLURM, Openstack and Kubernetes. ), },