Merged
4 changes: 4 additions & 0 deletions charts/deepgram-self-hosted/CHANGELOG.md
@@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),

## Unreleased

### Added

- Exposed the ability to add custom TOML sections to `api.toml` and `engine.toml` via `customToml`

## [0.19.0] - 2025-09-12

### Added
2 changes: 2 additions & 0 deletions charts/deepgram-self-hosted/README.md
@@ -199,6 +199,7 @@ If you encounter issues while deploying or using Deepgram, consider the followin
| api.additionalAnnotations | object | `nil` | Additional annotations to add to the API deployment |
| api.additionalLabels | object | `{}` | Additional labels to add to API resources |
| api.affinity | object | `{}` | [Affinity and anti-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity) to apply for API pods. |
| api.customToml | string | `nil` | Custom TOML sections can be added to extend api.toml |
| api.driverPool | object | `` | driverPool configures the backend pool of speech engines (generically referred to as "drivers" here). The API will load-balance among drivers in the standard pool; if one standard driver fails, the next one will be tried. |
| api.driverPool.standard | object | `` | standard is the main driver pool to use. |
| api.driverPool.standard.maxResponseSize | string | `"1073741824"` | Maximum response to deserialize from Driver (in bytes). Default is 1GB, expressed in bytes. |
@@ -252,6 +253,7 @@ If you encounter issues while deploying or using Deepgram, consider the followin
| engine.chunking.speechToText.streaming.minDuration | float | `nil` | minDuration is the minimum audio duration for a STT chunk size for a streaming request |
| engine.chunking.speechToText.streaming.step | float | `1` | step defines how often to return interim results, in seconds. This value may be lowered to increase the frequency of interim results. However, this also causes a significant decrease in the number of concurrent streams supported by a single GPU. Please contact your Deepgram Account representative for more details. |
| engine.concurrencyLimit.activeRequests | int | `nil` | activeRequests limits the number of active requests handled by a single Engine container. If additional requests beyond the limit are sent, the API container forming the request will try a different Engine pod. If no Engine pods are able to accept the request, the API will return a 429 HTTP response to the client. The `nil` default means no limit will be set. |
| engine.customToml | string | `nil` | Custom TOML sections can be added to extend engine.toml |
| engine.features.streamingNer | bool | `true` | Enables format entity tags on streaming audio *if* a valid NER model is available. |
| engine.halfPrecision.state | string | `"auto"` | Engine will automatically enable half precision operations if your GPU supports them. You can explicitly enable or disable this behavior with the state parameter which supports `"enable"`, `"disabled"`, and `"auto"`. |
| engine.image.path | string | `"quay.io/deepgram/self-hosted-engine"` | path configures the image path to use for creating Engine containers. You may change this from the public Quay image path if you have imported Deepgram images into a private container registry. |
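The `api.customToml` and `engine.customToml` entries documented above each take a multi-line TOML string. A minimal values override might look like the following sketch (the `[custom_section]` name and keys are illustrative, not chart defaults):

```yaml
# my-values.yaml -- illustrative override; section and key names are examples only
api:
  # Appended verbatim to the generated api.toml
  customToml: |
    [custom_section]
    example_key = "example_value"

engine:
  # Appended verbatim to the generated engine.toml
  customToml: |
    [custom_section]
    example_key = "example_value"
```

Passing this file with `helm upgrade -f my-values.yaml` layers it over the chart's values.yaml, where both keys default to `nil` (no extra sections appended).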
@@ -70,6 +70,10 @@ api:
memory: "8Gi"
cpu: "4000m"

# -- Custom TOML sections can be added here to extend api.toml
# customToml: |
# [custom_section]

engine:
affinity:
nodeAffinity:
@@ -95,6 +99,16 @@
concurrencyLimit:
activeRequests:

# -- Custom TOML sections can be added here to extend engine.toml
# customToml: |
# # Preload models on engine startup for faster initial requests
# # See https://deepgram.gitbook.io/help-center/how-can-i-pre-load-models-to-reduce-cold-start-latency
# [preload_models]
# models = [
# # Example model preload configuration:
# # { model = "general-nova-3", version = "2025-09-05.12808", language = "multi", format = false }
# ]

modelManager:
volumes:
aws:
4 changes: 4 additions & 0 deletions charts/deepgram-self-hosted/templates/api/api.config.yaml
@@ -108,3 +108,7 @@ data:
{{- end }}
{{- end }}
{{- end }}

{{- if .Values.api.customToml }}
{{ .Values.api.customToml | nindent 4 }}
{{- end }}
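This template appends the user-supplied string verbatim to the end of the generated api.toml, re-indented to sit inside the ConfigMap's `data` block. With `api.customToml` set, the rendered ConfigMap would end roughly like this (a sketch; `[custom_section]` and its key are illustrative, not chart defaults):

```yaml
# Sketch of the rendered API ConfigMap tail; values shown are examples only
data:
  api.toml: |
    # ... generated sections above ...

    [custom_section]
    example_key = "example_value"
```

Because `nindent 4` re-indents every line of the string by four spaces, the custom TOML nests correctly under the `api.toml` key regardless of how it was indented in the values file.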
@@ -67,3 +67,6 @@ data:
[half_precision]
state = "{{ .Values.engine.halfPrecision.state }}"

{{- if .Values.engine.customToml }}
{{ .Values.engine.customToml | nindent 4 }}
{{- end }}
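As with the API template, anything in `engine.customToml` is appended verbatim after the generated `[half_precision]` section of engine.toml. For example, uncommenting the `preload_models` block from the sample values file above would render roughly as follows (a sketch; the model entry follows the commented example and may not match your licensed models):

```yaml
# Sketch of the rendered Engine ConfigMap tail; model entry is an example only
data:
  engine.toml: |
    # ... generated sections above ...

    [half_precision]
    state = "auto"

    [preload_models]
    models = [
      { model = "general-nova-3", version = "2025-09-05.12808", language = "multi", format = false },
    ]
```

Since the string is appended last, custom sections can add new TOML tables but cannot override keys the chart has already emitted earlier in the file.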
6 changes: 6 additions & 0 deletions charts/deepgram-self-hosted/values.yaml
@@ -373,6 +373,9 @@ api:
# Default is 1GB, expressed in bytes.
maxResponseSize: "1073741824"

# -- Custom TOML sections can be added to extend api.toml
customToml:

engine:
# -- namePrefix is the prefix to apply to the name of all K8s objects
# associated with the Deepgram Engine containers.
@@ -482,6 +485,9 @@ engine:
# to the client. The `nil` default means no limit will be set.
activeRequests:

# -- Custom TOML sections can be added to extend engine.toml
customToml:

# -- Configure Engine containers to listen for requests from API containers.
# @default -- ``
server: