@@ -53,6 +53,14 @@ type ResourceLimits struct {
5353 Memory int64
5454}
5555
// FlavorResolution is the outcome of flavor selection for a pod: the name of
// the chosen flavor together with the defaults and SLURM flags taken from it.
type FlavorResolution struct {
	// FlavorName is the key of the selected flavor in the configured flavor set.
	FlavorName string
	// CPUDefault is the CPU default copied from the selected flavor.
	CPUDefault int64
	// MemoryDefault is the flavor's memory default, expressed in bytes.
	MemoryDefault int64
	// SlurmFlags are the SLURM flags declared by the selected flavor.
	SlurmFlags []string
}
63+
5664// stringToHex encodes the provided str string into a hex string and removes all trailing redundant zeroes to keep the output more compact
5765func stringToHex (str string ) string {
5866 var buffer bytes.Buffer
@@ -94,6 +102,151 @@ func parsingTimeFromString(Ctx context.Context, stringTime string, timestampForm
94102 return parsedTime , nil
95103}
96104
// parseMemoryString converts a memory size string (e.g. "16G", "32000MB",
// "4 K", "1024") into a number of bytes.
//
// The input is trimmed and upper-cased before parsing, so it is
// case-insensitive and tolerant of surrounding whitespace as well as
// whitespace between the number and the unit. Recognised suffixes are K/KB,
// M/MB and G/GB, all interpreted as 1024-based multiples; a value without a
// suffix is taken as bytes. An empty or whitespace-only string yields (0, nil).
//
// An error is returned when the numeric part is not a base-10 integer or when
// the scaled value does not fit in an int64. Negative values are not rejected
// and are scaled like any other value.
func parseMemoryString(memStr string) (int64, error) {
	// Normalise first so that whitespace-only input is treated as empty.
	memStr = strings.TrimSpace(strings.ToUpper(memStr))
	if memStr == "" {
		return 0, nil
	}

	// Local bound so the overflow guard needs no extra package import.
	const maxInt64 = int64(1<<63 - 1)

	// Drop one optional trailing "B" so that "G" and "GB" share a code path.
	// numStr stays the full string when no unit matches, so inputs such as
	// "16B" are still rejected by ParseInt below.
	numStr := memStr
	multiplier := int64(1)
	unit := strings.TrimSuffix(memStr, "B")
	switch {
	case strings.HasSuffix(unit, "G"):
		numStr = strings.TrimSuffix(unit, "G")
		multiplier = 1024 * 1024 * 1024
	case strings.HasSuffix(unit, "M"):
		numStr = strings.TrimSuffix(unit, "M")
		multiplier = 1024 * 1024
	case strings.HasSuffix(unit, "K"):
		numStr = strings.TrimSuffix(unit, "K")
		multiplier = 1024
	}

	// Allow a space between the number and its unit ("16 G").
	val, err := strconv.ParseInt(strings.TrimSpace(numStr), 10, 64)
	if err != nil {
		return 0, fmt.Errorf("invalid memory format %s: %w", memStr, err)
	}

	// Guard the multiplication: int64 overflow wraps silently in Go.
	limit := maxInt64 / multiplier
	if val > limit || val < -limit {
		return 0, fmt.Errorf("invalid memory format %s: value out of range", memStr)
	}

	return val * multiplier, nil
}
144+
145+ // detectGPUResources checks if the pod requests GPU resources and returns the GPU count
146+ func detectGPUResources (Ctx context.Context , containers []v1.Container ) int64 {
147+ var totalGPUs int64 = 0
148+
149+ for _ , container := range containers {
150+ // Check for nvidia.com/gpu
151+ if gpuLimit , ok := container .Resources .Limits ["nvidia.com/gpu" ]; ok {
152+ gpuCount := gpuLimit .Value ()
153+ if gpuCount > 0 {
154+ log .G (Ctx ).Infof ("Detected %d NVIDIA GPU(s) requested in container %s" , gpuCount , container .Name )
155+ totalGPUs += gpuCount
156+ }
157+ }
158+
159+ // Check for amd.com/gpu
160+ if gpuLimit , ok := container .Resources .Limits ["amd.com/gpu" ]; ok {
161+ gpuCount := gpuLimit .Value ()
162+ if gpuCount > 0 {
163+ log .G (Ctx ).Infof ("Detected %d AMD GPU(s) requested in container %s" , gpuCount , container .Name )
164+ totalGPUs += gpuCount
165+ }
166+ }
167+ }
168+
169+ return totalGPUs
170+ }
171+
172+ // resolveFlavor determines which flavor to use based on annotations, GPU detection, and default flavor
173+ func resolveFlavor (Ctx context.Context , config SlurmConfig , metadata metav1.ObjectMeta , containers []v1.Container ) (* FlavorResolution , error ) {
174+ // No flavors configured, return nil
175+ if len (config .Flavors ) == 0 {
176+ return nil , nil
177+ }
178+
179+ var selectedFlavor * FlavorConfig
180+ var flavorName string
181+
182+ // Priority 1: Check for explicit flavor annotation
183+ if annotationFlavor , ok := metadata .Annotations ["slurm-job.vk.io/flavor" ]; ok {
184+ if flavor , exists := config .Flavors [annotationFlavor ]; exists {
185+ selectedFlavor = & flavor
186+ flavorName = annotationFlavor
187+ log .G (Ctx ).Infof ("Using flavor '%s' from annotation" , flavorName )
188+ } else {
189+ log .G (Ctx ).Warningf ("Flavor '%s' specified in annotation not found, falling back to auto-detection" , annotationFlavor )
190+ }
191+ }
192+
193+ // Priority 2: Auto-detect GPU and select GPU flavor
194+ if selectedFlavor == nil {
195+ gpuCount := detectGPUResources (Ctx , containers )
196+ if gpuCount > 0 {
197+ // Look for a flavor with GPU in the name or SLURM flags
198+ for name , flavor := range config .Flavors {
199+ // Check if flavor has GPU-related SLURM flags
200+ hasGPUFlag := false
201+ for _ , flag := range flavor .SlurmFlags {
202+ if strings .Contains (flag , "--gres=gpu" ) || strings .Contains (flag , "gpu" ) {
203+ hasGPUFlag = true
204+ break
205+ }
206+ }
207+
208+ if hasGPUFlag || strings .Contains (strings .ToLower (name ), "gpu" ) {
209+ selectedFlavor = & flavor
210+ flavorName = name
211+ log .G (Ctx ).Infof ("Auto-detected GPU resources, using flavor '%s'" , flavorName )
212+ break
213+ }
214+ }
215+
216+ if selectedFlavor == nil {
217+ log .G (Ctx ).Warningf ("GPU resources detected but no GPU flavor found, falling back to default" )
218+ }
219+ }
220+ }
221+
222+ // Priority 3: Use default flavor
223+ if selectedFlavor == nil && config .DefaultFlavor != "" {
224+ if flavor , exists := config .Flavors [config .DefaultFlavor ]; exists {
225+ selectedFlavor = & flavor
226+ flavorName = config .DefaultFlavor
227+ log .G (Ctx ).Infof ("Using default flavor '%s'" , flavorName )
228+ }
229+ }
230+
231+ // No flavor selected
232+ if selectedFlavor == nil {
233+ return nil , nil
234+ }
235+
236+ // Parse memory default
237+ memoryBytes , err := parseMemoryString (selectedFlavor .MemoryDefault )
238+ if err != nil {
239+ return nil , fmt .Errorf ("failed to parse memory for flavor %s: %w" , flavorName , err )
240+ }
241+
242+ return & FlavorResolution {
243+ FlavorName : flavorName ,
244+ CPUDefault : selectedFlavor .CPUDefault ,
245+ MemoryDefault : memoryBytes ,
246+ SlurmFlags : selectedFlavor .SlurmFlags ,
247+ }, nil
248+ }
249+
97250// CreateDirectories is just a function to be sure directories exists at runtime
98251func (h * SidecarHandler ) CreateDirectories () error {
99252 path := h .Config .DataRootFolder
@@ -597,8 +750,23 @@ func produceSLURMScript(
597750 cpuLimitSetFromFlags := false
598751 memoryLimitSetFromFlags := false
599752
753+ // Resolve flavor and apply flavor SLURM flags
754+ flavor , err := resolveFlavor (Ctx , config , metadata , pod .Spec .Containers )
755+ if err != nil {
756+ log .G (Ctx ).Error ("Failed to resolve flavor: " , err )
757+ return "" , err
758+ }
759+
600760 var sbatchFlagsFromArgo []string
601761 sbatchFlagsAsString := ""
762+
763+ // Add flavor SLURM flags first (lowest priority)
764+ if flavor != nil && len (flavor .SlurmFlags ) > 0 {
765+ log .G (Ctx ).Infof ("Applying %d SLURM flag(s) from flavor '%s'" , len (flavor .SlurmFlags ), flavor .FlavorName )
766+ sbatchFlagsFromArgo = append (sbatchFlagsFromArgo , flavor .SlurmFlags ... )
767+ }
768+
769+ // Then process annotation flags (higher priority)
602770 if slurmFlags , ok := metadata .Annotations ["slurm-job.vk.io/flags" ]; ok {
603771
604772 reCpu := regexp .MustCompile (`--cpus-per-task(?:[ =]\S+)?` )
0 commit comments