Skip to content

Commit c64b90d

Browse files
committed
draft default flavor behavior
Signed-off-by: dciangot <dciangot@cern.ch>
1 parent 768a352 commit c64b90d

File tree

6 files changed

+306
-26
lines changed

6 files changed

+306
-26
lines changed

docker/SlurmConfig.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,23 @@ ContainerRuntime: singularity
2020
EnrootDefaultOptions: ["--rw"]
2121
EnrootPrefix: ""
2222
EnrootPath: enroot
23+
24+
# Flavor configuration - predefined sets of SLURM submission options
25+
DefaultFlavor: "default"
26+
Flavors:
27+
default:
28+
Name: "default"
29+
Description: "Standard CPU job (2 cores, 8GB RAM)"
30+
CPUDefault: 2
31+
MemoryDefault: "8G"
32+
SlurmFlags:
33+
- "--partition=normal"
34+
35+
gpu-nvidia:
36+
Name: "gpu-nvidia"
37+
Description: "GPU job with NVIDIA GPU (4 cores, 32GB RAM, 1 GPU)"
38+
CPUDefault: 4
39+
MemoryDefault: "32G"
40+
SlurmFlags:
41+
- "--gres=gpu:1"
42+
- "--partition=gpu"

examples/config/SlurmConfig.yaml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,35 @@ BashPath: /bin/bash
1717
VerboseLogging: true
1818
ErrorsOnlyLogging: false
1919
EnableProbes: true
20+
21+
# Flavor configuration - predefined sets of SLURM submission options
22+
DefaultFlavor: "default"
23+
Flavors:
24+
default:
25+
Name: "default"
26+
Description: "Standard CPU job (4 cores, 16GB RAM)"
27+
CPUDefault: 4
28+
MemoryDefault: "16G"
29+
SlurmFlags:
30+
- "--partition=cpu"
31+
- "--time=01:00:00"
32+
33+
gpu-nvidia:
34+
Name: "gpu-nvidia"
35+
Description: "GPU job with NVIDIA GPU (8 cores, 64GB RAM, 1 GPU)"
36+
CPUDefault: 8
37+
MemoryDefault: "64G"
38+
SlurmFlags:
39+
- "--gres=gpu:1"
40+
- "--partition=gpu"
41+
- "--time=04:00:00"
42+
43+
high-io:
44+
Name: "high-io"
45+
Description: "High I/O job (16 cores, 32GB RAM, fast storage)"
46+
CPUDefault: 16
47+
MemoryDefault: "32G"
48+
SlurmFlags:
49+
- "--partition=fast-io"
50+
- "--time=02:00:00"
51+
- "--constraint=ssd"

pkg/slurm/Create.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,15 @@ func (h *SidecarHandler) SubmitHandler(w http.ResponseWriter, r *http.Request) {
5757
metadata := data.Pod.ObjectMeta
5858
filesPath := h.Config.DataRootFolder + data.Pod.Namespace + "-" + string(data.Pod.UID)
5959

60+
// Resolve flavor to apply default CPU and memory
61+
flavor, err := resolveFlavor(spanCtx, h.Config, metadata, data.Pod.Spec.Containers)
62+
if err != nil {
63+
log.G(h.Ctx).Error("Failed to resolve flavor: ", err)
64+
statusCode = http.StatusInternalServerError
65+
h.handleError(spanCtx, w, statusCode, err)
66+
return
67+
}
68+
6069
var runtime_command_pod []ContainerCommand
6170
var resourceLimits ResourceLimits
6271

@@ -69,6 +78,20 @@ func (h *SidecarHandler) SubmitHandler(w http.ResponseWriter, r *http.Request) {
6978
cpuLimit := int64(0)
7079
memoryLimit := int64(0)
7180

81+
// Apply flavor defaults if available
82+
if flavor != nil {
83+
if flavor.CPUDefault > 0 {
84+
cpuLimit = flavor.CPUDefault
85+
maxCPULimit = int(flavor.CPUDefault)
86+
log.G(h.Ctx).Infof("Applying CPU default from flavor '%s': %d", flavor.FlavorName, flavor.CPUDefault)
87+
}
88+
if flavor.MemoryDefault > 0 {
89+
memoryLimit = flavor.MemoryDefault
90+
maxMemoryLimit = int(flavor.MemoryDefault)
91+
log.G(h.Ctx).Infof("Applying memory default from flavor '%s': %d bytes", flavor.FlavorName, flavor.MemoryDefault)
92+
}
93+
}
94+
7295
for i, container := range containers {
7396
log.G(h.Ctx).Info("- Beginning script generation for container " + container.Name)
7497

pkg/slurm/func.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,32 @@ func NewSlurmConfig() (SlurmConfig, error) {
125125
if len(SlurmConfigInst.SingularityDefaultOptions) == 0 {
126126
SlurmConfigInst.SingularityDefaultOptions = []string{"--nv", "--no-eval", "--containall"}
127127
}
128+
129+
// Validate and log flavor configuration
130+
if len(SlurmConfigInst.Flavors) > 0 {
131+
log.G(context.Background()).Infof("Loaded %d flavor(s):", len(SlurmConfigInst.Flavors))
132+
for name, flavor := range SlurmConfigInst.Flavors {
133+
if flavor.Name == "" {
134+
log.G(context.Background()).Warningf("Flavor '%s' has no Name field set, using key as name", name)
135+
flavor.Name = name
136+
SlurmConfigInst.Flavors[name] = flavor
137+
}
138+
log.G(context.Background()).Infof(" - %s: %s (CPU: %d, Memory: %s, SLURM flags: %d)",
139+
flavor.Name, flavor.Description, flavor.CPUDefault, flavor.MemoryDefault, len(flavor.SlurmFlags))
140+
}
141+
142+
// Validate DefaultFlavor if set
143+
if SlurmConfigInst.DefaultFlavor != "" {
144+
if _, exists := SlurmConfigInst.Flavors[SlurmConfigInst.DefaultFlavor]; !exists {
145+
log.G(context.Background()).Warningf("DefaultFlavor '%s' not found in Flavors map, ignoring", SlurmConfigInst.DefaultFlavor)
146+
SlurmConfigInst.DefaultFlavor = ""
147+
} else {
148+
log.G(context.Background()).Infof("Default flavor set to: %s", SlurmConfigInst.DefaultFlavor)
149+
}
150+
}
151+
} else {
152+
log.G(context.Background()).Info("No flavors configured, using default behavior")
153+
}
128154
}
129155
return SlurmConfigInst, nil
130156
}

pkg/slurm/prepare.go

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,14 @@ type ResourceLimits struct {
5353
Memory int64
5454
}
5555

56+
// FlavorResolution holds the resolved flavor information: the values that
// resolveFlavor derives from the selected FlavorConfig, with the memory
// default already converted from its string form into a byte count.
57+
type FlavorResolution struct {
58+
// FlavorName is the key under which the selected flavor is configured.
FlavorName string
59+
// CPUDefault is copied unchanged from the selected flavor's CPUDefault.
CPUDefault int64
60+
MemoryDefault int64 // in bytes
61+
// SlurmFlags is the selected flavor's list of SLURM submission flags.
SlurmFlags []string
62+
}
63+
5664
// stringToHex encodes the provided str string into a hex string and removes all trailing redundant zeroes to keep the output more compact
5765
func stringToHex(str string) string {
5866
var buffer bytes.Buffer
@@ -94,6 +102,151 @@ func parsingTimeFromString(Ctx context.Context, stringTime string, timestampForm
94102
return parsedTime, nil
95103
}
96104

105+
// parseMemoryString converts a memory size string (e.g. "16G", "32000M", "1024")
// into a number of bytes.
//
// The value must be a non-negative base-10 integer, optionally followed by a
// unit suffix. Suffixes are case-insensitive and always binary multiples
// (1K = 1024 bytes): K/KB/Ki/KiB, M/MB/Mi/MiB, G/GB/Gi/GiB, T/TB/Ti/TiB.
// A bare "B" suffix, or no suffix at all, means bytes.
//
// An empty or whitespace-only string yields (0, nil) so that an unset value
// can be told apart from an invalid one. An error is returned when the number
// cannot be parsed, is negative, or does not fit in an int64 once converted.
func parseMemoryString(memStr string) (int64, error) {
	const maxInt64 = int64(^uint64(0) >> 1)

	// Normalize before the emptiness check so "  " behaves like "".
	normalized := strings.ToUpper(strings.TrimSpace(memStr))
	if normalized == "" {
		return 0, nil
	}

	// Longer suffixes come first and the bare "B" comes last, so that "GB" or
	// "GIB" is never mistaken for a plain byte suffix.
	units := []struct {
		suffix     string
		multiplier int64
	}{
		{"TIB", 1 << 40}, {"GIB", 1 << 30}, {"MIB", 1 << 20}, {"KIB", 1 << 10},
		{"TB", 1 << 40}, {"GB", 1 << 30}, {"MB", 1 << 20}, {"KB", 1 << 10},
		{"TI", 1 << 40}, {"GI", 1 << 30}, {"MI", 1 << 20}, {"KI", 1 << 10},
		{"T", 1 << 40}, {"G", 1 << 30}, {"M", 1 << 20}, {"K", 1 << 10},
		{"B", 1},
	}

	digits, multiplier := normalized, int64(1)
	for _, unit := range units {
		if strings.HasSuffix(normalized, unit.suffix) {
			digits = strings.TrimSuffix(normalized, unit.suffix)
			multiplier = unit.multiplier
			break
		}
	}

	val, err := strconv.ParseInt(digits, 10, 64)
	if err != nil {
		return 0, fmt.Errorf("invalid memory format %s: %w", normalized, err)
	}
	if val < 0 {
		return 0, fmt.Errorf("invalid memory format %s: size must not be negative", normalized)
	}
	// Guard the multiplication: int64 overflow would otherwise wrap silently.
	if val > maxInt64/multiplier {
		return 0, fmt.Errorf("invalid memory format %s: size overflows int64", normalized)
	}
	return val * multiplier, nil
}
144+
145+
// detectGPUResources checks if the pod requests GPU resources and returns the GPU count
146+
func detectGPUResources(Ctx context.Context, containers []v1.Container) int64 {
147+
var totalGPUs int64 = 0
148+
149+
for _, container := range containers {
150+
// Check for nvidia.com/gpu
151+
if gpuLimit, ok := container.Resources.Limits["nvidia.com/gpu"]; ok {
152+
gpuCount := gpuLimit.Value()
153+
if gpuCount > 0 {
154+
log.G(Ctx).Infof("Detected %d NVIDIA GPU(s) requested in container %s", gpuCount, container.Name)
155+
totalGPUs += gpuCount
156+
}
157+
}
158+
159+
// Check for amd.com/gpu
160+
if gpuLimit, ok := container.Resources.Limits["amd.com/gpu"]; ok {
161+
gpuCount := gpuLimit.Value()
162+
if gpuCount > 0 {
163+
log.G(Ctx).Infof("Detected %d AMD GPU(s) requested in container %s", gpuCount, container.Name)
164+
totalGPUs += gpuCount
165+
}
166+
}
167+
}
168+
169+
return totalGPUs
170+
}
171+
172+
// resolveFlavor determines which flavor to use based on annotations, GPU detection, and default flavor
173+
func resolveFlavor(Ctx context.Context, config SlurmConfig, metadata metav1.ObjectMeta, containers []v1.Container) (*FlavorResolution, error) {
174+
// No flavors configured, return nil
175+
if len(config.Flavors) == 0 {
176+
return nil, nil
177+
}
178+
179+
var selectedFlavor *FlavorConfig
180+
var flavorName string
181+
182+
// Priority 1: Check for explicit flavor annotation
183+
if annotationFlavor, ok := metadata.Annotations["slurm-job.vk.io/flavor"]; ok {
184+
if flavor, exists := config.Flavors[annotationFlavor]; exists {
185+
selectedFlavor = &flavor
186+
flavorName = annotationFlavor
187+
log.G(Ctx).Infof("Using flavor '%s' from annotation", flavorName)
188+
} else {
189+
log.G(Ctx).Warningf("Flavor '%s' specified in annotation not found, falling back to auto-detection", annotationFlavor)
190+
}
191+
}
192+
193+
// Priority 2: Auto-detect GPU and select GPU flavor
194+
if selectedFlavor == nil {
195+
gpuCount := detectGPUResources(Ctx, containers)
196+
if gpuCount > 0 {
197+
// Look for a flavor with GPU in the name or SLURM flags
198+
for name, flavor := range config.Flavors {
199+
// Check if flavor has GPU-related SLURM flags
200+
hasGPUFlag := false
201+
for _, flag := range flavor.SlurmFlags {
202+
if strings.Contains(flag, "--gres=gpu") || strings.Contains(flag, "gpu") {
203+
hasGPUFlag = true
204+
break
205+
}
206+
}
207+
208+
if hasGPUFlag || strings.Contains(strings.ToLower(name), "gpu") {
209+
selectedFlavor = &flavor
210+
flavorName = name
211+
log.G(Ctx).Infof("Auto-detected GPU resources, using flavor '%s'", flavorName)
212+
break
213+
}
214+
}
215+
216+
if selectedFlavor == nil {
217+
log.G(Ctx).Warningf("GPU resources detected but no GPU flavor found, falling back to default")
218+
}
219+
}
220+
}
221+
222+
// Priority 3: Use default flavor
223+
if selectedFlavor == nil && config.DefaultFlavor != "" {
224+
if flavor, exists := config.Flavors[config.DefaultFlavor]; exists {
225+
selectedFlavor = &flavor
226+
flavorName = config.DefaultFlavor
227+
log.G(Ctx).Infof("Using default flavor '%s'", flavorName)
228+
}
229+
}
230+
231+
// No flavor selected
232+
if selectedFlavor == nil {
233+
return nil, nil
234+
}
235+
236+
// Parse memory default
237+
memoryBytes, err := parseMemoryString(selectedFlavor.MemoryDefault)
238+
if err != nil {
239+
return nil, fmt.Errorf("failed to parse memory for flavor %s: %w", flavorName, err)
240+
}
241+
242+
return &FlavorResolution{
243+
FlavorName: flavorName,
244+
CPUDefault: selectedFlavor.CPUDefault,
245+
MemoryDefault: memoryBytes,
246+
SlurmFlags: selectedFlavor.SlurmFlags,
247+
}, nil
248+
}
249+
97250
// CreateDirectories is just a function to be sure directories exists at runtime
98251
func (h *SidecarHandler) CreateDirectories() error {
99252
path := h.Config.DataRootFolder
@@ -597,8 +750,23 @@ func produceSLURMScript(
597750
cpuLimitSetFromFlags := false
598751
memoryLimitSetFromFlags := false
599752

753+
// Resolve flavor and apply flavor SLURM flags
754+
flavor, err := resolveFlavor(Ctx, config, metadata, pod.Spec.Containers)
755+
if err != nil {
756+
log.G(Ctx).Error("Failed to resolve flavor: ", err)
757+
return "", err
758+
}
759+
600760
var sbatchFlagsFromArgo []string
601761
sbatchFlagsAsString := ""
762+
763+
// Add flavor SLURM flags first (lowest priority)
764+
if flavor != nil && len(flavor.SlurmFlags) > 0 {
765+
log.G(Ctx).Infof("Applying %d SLURM flag(s) from flavor '%s'", len(flavor.SlurmFlags), flavor.FlavorName)
766+
sbatchFlagsFromArgo = append(sbatchFlagsFromArgo, flavor.SlurmFlags...)
767+
}
768+
769+
// Then process annotation flags (higher priority)
602770
if slurmFlags, ok := metadata.Annotations["slurm-job.vk.io/flags"]; ok {
603771

604772
reCpu := regexp.MustCompile(`--cpus-per-task(?:[ =]\S+)?`)

pkg/slurm/types.go

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,45 @@
11
package slurm
22

3+
// FlavorConfig holds the configuration for a specific flavor: a named,
// predefined set of default resources and SLURM submission flags, loaded from
// the "Flavors" map of the YAML configuration.
4+
type FlavorConfig struct {
5+
// Name is the flavor's display name.
Name string `yaml:"Name"`
6+
// Description is free-form, human-readable text describing the flavor.
Description string `yaml:"Description"`
7+
// CPUDefault is the default CPU count associated with the flavor.
CPUDefault int64 `yaml:"CPUDefault"`
8+
MemoryDefault string `yaml:"MemoryDefault"` // e.g., "16G", "32000M", "1024"
9+
// SlurmFlags lists the SLURM submission flags associated with the flavor.
SlurmFlags []string `yaml:"SlurmFlags"`
10+
}
11+
312
// SlurmConfig holds the whole configuration
413
type SlurmConfig struct {
5-
VKConfigPath string `yaml:"VKConfigPath"`
6-
Sbatchpath string `yaml:"SbatchPath"`
7-
Scancelpath string `yaml:"ScancelPath"`
8-
Squeuepath string `yaml:"SqueuePath"`
9-
Sinfopath string `yaml:"SinfoPath"`
10-
Sidecarport string `yaml:"SidecarPort"`
11-
Socket string `yaml:"Socket"`
12-
ExportPodData bool `yaml:"ExportPodData"`
13-
Commandprefix string `yaml:"CommandPrefix"`
14-
ImagePrefix string `yaml:"ImagePrefix"`
15-
DataRootFolder string `yaml:"DataRootFolder"`
16-
Namespace string `yaml:"Namespace"`
17-
Tsocks bool `yaml:"Tsocks"`
18-
Tsockspath string `yaml:"TsocksPath"`
19-
Tsockslogin string `yaml:"TsocksLoginNode"`
20-
BashPath string `yaml:"BashPath"`
21-
VerboseLogging bool `yaml:"VerboseLogging"`
22-
ErrorsOnlyLogging bool `yaml:"ErrorsOnlyLogging"`
23-
SingularityDefaultOptions []string `yaml:"SingularityDefaultOptions"`
24-
SingularityPrefix string `yaml:"SingularityPrefix"`
25-
SingularityPath string `yaml:"SingularityPath"`
26-
EnableProbes bool `yaml:"EnableProbes"`
14+
VKConfigPath string `yaml:"VKConfigPath"`
15+
Sbatchpath string `yaml:"SbatchPath"`
16+
Scancelpath string `yaml:"ScancelPath"`
17+
Squeuepath string `yaml:"SqueuePath"`
18+
Sinfopath string `yaml:"SinfoPath"`
19+
Sidecarport string `yaml:"SidecarPort"`
20+
Socket string `yaml:"Socket"`
21+
ExportPodData bool `yaml:"ExportPodData"`
22+
Commandprefix string `yaml:"CommandPrefix"`
23+
ImagePrefix string `yaml:"ImagePrefix"`
24+
DataRootFolder string `yaml:"DataRootFolder"`
25+
Namespace string `yaml:"Namespace"`
26+
Tsocks bool `yaml:"Tsocks"`
27+
Tsockspath string `yaml:"TsocksPath"`
28+
Tsockslogin string `yaml:"TsocksLoginNode"`
29+
BashPath string `yaml:"BashPath"`
30+
VerboseLogging bool `yaml:"VerboseLogging"`
31+
ErrorsOnlyLogging bool `yaml:"ErrorsOnlyLogging"`
32+
SingularityDefaultOptions []string `yaml:"SingularityDefaultOptions"`
33+
SingularityPrefix string `yaml:"SingularityPrefix"`
34+
SingularityPath string `yaml:"SingularityPath"`
35+
EnableProbes bool `yaml:"EnableProbes"`
2736
set bool
28-
EnrootDefaultOptions []string `yaml:"EnrootDefaultOptions" default:"[\"--rw\"]"`
29-
EnrootPrefix string `yaml:"EnrootPrefix"`
30-
EnrootPath string `yaml:"EnrootPath"`
31-
ContainerRuntime string `yaml:"ContainerRuntime" default:"singularity"` // "singularity" or "enroot"
37+
EnrootDefaultOptions []string `yaml:"EnrootDefaultOptions" default:"[\"--rw\"]"`
38+
EnrootPrefix string `yaml:"EnrootPrefix"`
39+
EnrootPath string `yaml:"EnrootPath"`
40+
ContainerRuntime string `yaml:"ContainerRuntime" default:"singularity"` // "singularity" or "enroot"
41+
Flavors map[string]FlavorConfig `yaml:"Flavors"`
42+
DefaultFlavor string `yaml:"DefaultFlavor"`
3243
}
3344

3445
type CreateStruct struct {

0 commit comments

Comments
 (0)