basetenlabs · dsingal0 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025
diff --git a/truss/contexts/image_builder/serving_image_builder.py b/truss/contexts/image_builder/serving_image_builder.py
@@ -378,17 +378,20 @@ def generate_docker_server_nginx_config(build_dir, config):
     nginx_filepath.write_text(nginx_content)
 
 
-def generate_docker_server_supervisord_config(build_dir, config):
-    supervisord_template = read_template_from_fs(
-        DOCKER_SERVER_TEMPLATES_DIR, "supervisord.conf.jinja"
+def generate_docker_server_wrapper_script(build_dir, config):
+    """Write the docker-server process wrapper script into the build directory.
+
+    Reads ``server_wrapper.sh`` from ``DOCKER_SERVER_TEMPLATES_DIR``, renders it
+    with the configured start command and server port, and writes the result to
+    ``build_dir / "server_wrapper.sh"``.
+
+    Args:
+        build_dir: Directory the rendered script is written into. It is joined
+            with ``/``, so presumably a ``pathlib.Path`` (verify against callers).
+        config: Truss config; ``config.docker_server.start_command`` and
+            ``config.docker_server.server_port`` are read.
+
+    Raises:
+        AssertionError: If ``config.docker_server.start_command`` is ``None``.
+    """
+    wrapper_template = read_template_from_fs(
+        DOCKER_SERVER_TEMPLATES_DIR, "server_wrapper.sh"
     )
     assert config.docker_server.start_command is not None, (
         "docker_server.start_command is required to use custom server"
     )
     start_command = config.docker_server.start_command
-    supervisord_contents = supervisord_template.render(start_command=start_command)
-    supervisord_filepath = build_dir / "supervisord.conf"
-    supervisord_filepath.write_text(supervisord_contents)
+    server_port = config.docker_server.server_port
+    # NOTE(review): server_wrapper.sh appears to read START_COMMAND and
+    # SERVER_PORT from environment variables and contains no template
+    # placeholders, so these render() arguments look unused - confirm.
+    wrapper_contents = wrapper_template.render(
+        start_command=start_command, server_port=server_port
+    )
+    wrapper_filepath = build_dir / "server_wrapper.sh"
+    wrapper_filepath.write_text(wrapper_contents)
 
 
 class ServingImageBuilderContext(TrussContext):
@@ -586,7 +589,7 @@ def prepare_image_build_dir(
 
             generate_docker_server_nginx_config(build_dir, config)
 
-            generate_docker_server_supervisord_config(build_dir, config)
+            generate_docker_server_wrapper_script(build_dir, config)
 
         # Override config.yml
         with (build_dir / CONFIG_FILE).open("w") as config_file:

diff --git a/truss/templates/docker_server/server_wrapper.sh b/truss/templates/docker_server/server_wrapper.sh
@@ -0,0 +1,300 @@
+#!/bin/bash
+set -euo pipefail
+
+# Enhanced shell script to match supervisord behavior for custom servers
+# Manages nginx and model server processes with auto-restart and proper output handling
+
+# Global variables for process management
+declare -A PROCESS_PIDS
+declare -A RESTART_COUNTS
+declare -A LAST_RESTART_TIME
+declare -A PROCESS_START_TIME
+SHUTDOWN_REQUESTED=false
+
+# Configuration matching supervisord defaults
+MAX_RESTART_ATTEMPTS=3
+RESTART_RESET_TIME=10  # Reset restart counter after 10 seconds of stable operation
+FATAL_STATE_GRACE_PERIOD=5  # Wait 5 seconds before declaring fatal state
+LINEAR_BACKOFF_INTERVAL=1  # supervisord uses linear backoff by default
+
+log() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2
+}
+
+# Function to start nginx with output redirection
+start_nginx() {
+    log "Starting nginx..."
+
+    # Start nginx with output redirected to stdout (like supervisord's stdout_logfile=/dev/fd/1)
+    nginx -g "daemon off;" &
+    local nginx_pid=$!
+
+    # Give nginx a moment to start
+    sleep 1
+
+    # Check if nginx started successfully
+    if kill -0 "$nginx_pid" 2>/dev/null; then
+        PROCESS_PIDS["nginx"]=$nginx_pid
+        PROCESS_START_TIME["nginx"]=$(date +%s)
+        log "Nginx started successfully (PID: $nginx_pid)"
+        return 0
+    else
+        log "ERROR: Failed to start nginx"
+        return 1
+    fi
+}
+
+# Function to start model server with output redirection
+start_model_server() {
+    log "Starting model server with command: $START_COMMAND"
+
+    # Start model server with output redirected to stdout (like supervisord)
+    # Using eval to handle complex commands properly
+    eval "$START_COMMAND" &
+    local model_pid=$!
+
+    # Wait for model server to be ready (similar to supervisord's startsecs=30)
+    log "Waiting for model server to be ready..."
+    for i in {1..30}; do
+        if ! kill -0 "$model_pid" 2>/dev/null; then
+            log "ERROR: Model server failed to start"
+            return 1
+        fi
+
+        # Check if server is responding (basic health check)
+        if curl -s -f "http://localhost:${SERVER_PORT}/ready" >/dev/null 2>&1; then
+            log "Model server is ready"
+            break
+        fi
+
+        if [[ $i -eq 30 ]]; then
+            log "WARNING: Model server readiness check timed out, continuing anyway"
+        fi
+
+        sleep 1
+    done
+
+    PROCESS_PIDS["model_server"]=$model_pid
+    PROCESS_START_TIME["model_server"]=$(date +%s)
+    log "Model server started successfully (PID: $model_pid)"
+    return 0
+}
+
+# Decide whether a process has been up long enough to earn a fresh restart budget.
+#
+# Arguments: $1 - process name (key into PROCESS_START_TIME)
+# Returns:   0 if more than RESTART_RESET_TIME seconds have passed since the
+#            recorded start time (a missing entry counts as start time 0),
+#            1 otherwise.
+should_reset_restart_counter() {
+    local name=$1
+    local now=$(date +%s)
+    local started_at=${PROCESS_START_TIME[$name]:-0}
+
+    if (( now - started_at > RESTART_RESET_TIME )); then
+        return 0
+    fi
+    return 1
+}
+
+# Restart one managed process, enforcing the restart cap and linear backoff.
+#
+# Arguments: $1 - process name ("nginx" or "model_server")
+# Globals written: RESTART_COUNTS[$1], LAST_RESTART_TIME[$1]
+# Returns:   1 if the restart cap is already reached; otherwise the status of
+#            the matching start function (0 for an unrecognised name).
+restart_process() {
+    local name=$1
+
+    # A process that stayed up long enough gets its restart budget back.
+    if should_reset_restart_counter "$name"; then
+        RESTART_COUNTS[$name]=0
+        log "Resetting restart counter for $name (process ran stably for $RESTART_RESET_TIME seconds)"
+    fi
+
+    local attempts=${RESTART_COUNTS[$name]:-0}
+
+    if (( attempts >= MAX_RESTART_ATTEMPTS )); then
+        log "ERROR: $name has reached max restart limit ($MAX_RESTART_ATTEMPTS)"
+        return 1
+    fi
+
+    # Linear backoff: the first restart is immediate, each later one waits longer.
+    local next_attempt=$((attempts + 1))
+    local delay=$((attempts * LINEAR_BACKOFF_INTERVAL))
+
+    log "Restarting $name (attempt $next_attempt/$MAX_RESTART_ATTEMPTS) after ${delay}s backoff"
+    sleep $delay
+
+    # Book-keeping happens before the start attempt, so a failed start still
+    # consumes one attempt.
+    RESTART_COUNTS[$name]=$next_attempt
+    LAST_RESTART_TIME[$name]=$(date +%s)
+
+    case "$name" in
+        nginx)        start_nginx ;;
+        model_server) start_model_server ;;
+    esac
+
+    return $?
+}
+
+# Function to check if a process is running
+check_process() {
+    local process_name=$1
+    local pid=${PROCESS_PIDS[$process_name]:-}
+
+    if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
+        return 0  # Process is running
+    else
+        return 1  # Process is not running
+    fi
+}
+
+# React to a managed process having stopped: drop its PID and try one restart.
+#
+# Arguments: $1 - process name ("nginx" or "model_server")
+# Returns:   0 if the process was restarted, or if shutdown has been requested
+#            (before the restart or during the grace period);
+#            1 if the restart failed and the process is declared FATAL.
+handle_process_failure() {
+    local process_name=$1
+
+    if [[ "$SHUTDOWN_REQUESTED" == "true" ]]; then
+        return 0  # Don't restart if shutdown was requested
+    fi
+
+    log "WARNING: $process_name process has stopped"
+
+    # Remove from process tracking. The argument is quoted so the subscript is
+    # never subjected to pathname expansion (an unquoted NAME[key] is a glob
+    # pattern and could match a file in the working directory).
+    unset "PROCESS_PIDS[$process_name]"
+
+    # Attempt to restart the process
+    if restart_process "$process_name"; then
+        log "Successfully restarted $process_name"
+        return 0
+    fi
+
+    # NOTE(review): restart_process also returns non-zero when a single start
+    # attempt fails, not only when the restart budget is exhausted, so the
+    # "max attempts" wording below can be reached after one failed start.
+    log "ERROR: Failed to restart $process_name after max attempts"
+
+    # Wait before declaring the fatal state, giving a shutdown request a chance
+    # to arrive first.
+    log "Waiting $FATAL_STATE_GRACE_PERIOD seconds before declaring fatal state..."
+    sleep $FATAL_STATE_GRACE_PERIOD
+
+    # Check if shutdown was requested during grace period
+    if [[ "$SHUTDOWN_REQUESTED" == "true" ]]; then
+        return 0
+    fi
+
+    log "ERROR: $process_name has entered FATAL state (exhausted restart attempts)"
+    return 1
+}
+
+# Cleanup function for graceful shutdown
+cleanup() {
+    log "Received shutdown signal, stopping processes..."
+    SHUTDOWN_REQUESTED=true
+
+    # Stop model server if running
+    if check_process "model_server"; then
+        local model_pid=${PROCESS_PIDS["model_server"]}
+        log "Stopping model server (PID: $model_pid)"
+        kill -TERM "$model_pid" 2>/dev/null || true
+
+        # Wait for graceful shutdown (max 30 seconds)
+        for i in {1..30}; do
+            if ! kill -0 "$model_pid" 2>/dev/null; then
+                log "Model server stopped gracefully"
+                break
+            fi
+            if [[ $i -eq 30 ]]; then
+                log "Force killing model server"
+                kill -KILL "$model_pid" 2>/dev/null || true
+            fi
+            sleep 1
+        done
+    fi
+
+    # Stop nginx if running
+    if check_process "nginx"; then
+        local nginx_pid=${PROCESS_PIDS["nginx"]}
+        log "Stopping nginx (PID: $nginx_pid)"
+        kill -TERM "$nginx_pid" 2>/dev/null || true
+
+        # Wait for graceful shutdown (max 10 seconds)
+        for i in {1..10}; do
+            if ! kill -0 "$nginx_pid" 2>/dev/null; then
+                log "Nginx stopped gracefully"
+                break
+            fi
+            if [[ $i -eq 10 ]]; then
+                log "Force killing nginx"
+                kill -KILL "$nginx_pid" 2>/dev/null || true
+            fi
+            sleep 1
+        done
+    fi
+
+    log "Shutdown complete"
+    exit 0
+}
+
+# Set up signal handlers
+trap cleanup SIGTERM SIGINT
+
+# Validate required environment variables
+if [[ -z "${START_COMMAND:-}" ]]; then
+    log "ERROR: START_COMMAND environment variable is required"
+    exit 1
+fi
+
+if [[ -z "${SERVER_PORT:-}" ]]; then
+    log "ERROR: SERVER_PORT environment variable is required"
+    exit 1
+fi
+
+log "Starting custom server wrapper"
+log "Model server command: $START_COMMAND"
+log "Server port: $SERVER_PORT"
+
+# Initialize restart counts using associative arrays (properly scoped)
+RESTART_COUNTS["nginx"]=0
+RESTART_COUNTS["model_server"]=0
+
+# Start both processes
+if ! start_nginx; then
+    log "ERROR: Failed to start nginx initially"
+    exit 1
+fi
+
+if ! start_model_server; then
+    log "ERROR: Failed to start model server initially"
+    cleanup
+    exit 1
+fi
+
+log "Both services are running, monitoring processes..."
+
+# Main monitoring loop - matches supervisord's behavior
+FATAL_STATE_REACHED=false
+
+while [[ "$FATAL_STATE_REACHED" == "false" && "$SHUTDOWN_REQUESTED" == "false" ]]; do
+    # Check nginx status
+    if ! check_process "nginx"; then
+        log "WARNING: Nginx process has stopped"
+        if ! handle_process_failure "nginx"; then
+            log "ERROR: Nginx has entered FATAL state"
+            FATAL_STATE_REACHED=true
+        fi
+    fi
+
+    # Check model server status (only if we haven't reached fatal state)
+    if [[ "$FATAL_STATE_REACHED" == "false" ]] && ! check_process "model_server"; then
+        log "WARNING: Model server process has stopped"
+        if ! handle_process_failure "model_server"; then
+            log "ERROR: Model server has entered FATAL state"
+            FATAL_STATE_REACHED=true
+        fi
+    fi
+
+    # Sleep for a short interval before checking again (like supervisord)
+    sleep 5
+done
+
+# Handle fatal state (like supervisord's PROCESS_STATE_FATAL)
+if [[ "$FATAL_STATE_REACHED" == "true" ]]; then
+    log "ERROR: One or more processes have entered FATAL state - shutting down"
+    cleanup
+    exit 1
+fi
+
+# Normal shutdown
+if [[ "$SHUTDOWN_REQUESTED" == "true" ]]; then
+    cleanup
+fi
diff --git a/truss/templates/docker_server_requirements.txt b/truss/templates/docker_server_requirements.txt
@@ -1 +1 @@
-supervisor==4.2.5
+# Shell script wrapper replaces supervisord for process management
diff --git a/truss/templates/server.Dockerfile.jinja b/truss/templates/server.Dockerfile.jinja
@@ -119,17 +119,18 @@ RUN uv python install {{ control_python_version }}
 RUN uv venv /docker_server/.venv --python {{ control_python_version }}
 RUN uv pip install --python /docker_server/.venv/bin/python -r /app/docker_server_requirements.txt --no-cache-dir
 {% set proxy_config_path = "/etc/nginx/conf.d/proxy.conf" %}
-{% set supervisor_config_path = "/etc/supervisor/supervisord.conf" %}
-{% set supervisor_server_url = "http://localhost:8080" %}
+{% set server_wrapper_path = "/docker_server/server_wrapper.sh" %}
 COPY --chown={{ default_owner }} ./proxy.conf {{ proxy_config_path }}
-COPY --chown={{ default_owner }} ./supervisord.conf {{ supervisor_config_path }}
-ENV SUPERVISOR_SERVER_URL="{{ supervisor_server_url }}"
-ENV SERVER_START_CMD="/docker_server/.venv/bin/supervisord -c {{ supervisor_config_path }}"
+COPY --chown={{ default_owner }} ./server_wrapper.sh {{ server_wrapper_path }}
+RUN chmod +x {{ server_wrapper_path }}
+ENV START_COMMAND={{ config.docker_server.start_command | tojson }}
+ENV SERVER_PORT={{ config.docker_server.server_port | tojson }}
+ENV SERVER_START_CMD={{ server_wrapper_path | tojson }}
 {#- default configuration uses port 80, which requires root privileges, so we remove it #}
 RUN rm -f /etc/nginx/sites-enabled/default
 {#- nginx writes to /var/lib/nginx, /var/log/nginx, and /run directories #}
 {{ chown_and_switch_to_regular_user_if_enabled(["/var/lib/nginx", "/var/log/nginx", "/run"]) }}
-ENTRYPOINT ["/docker_server/.venv/bin/supervisord", "-c", "{{ supervisor_config_path }}"]
+ENTRYPOINT ["{{ server_wrapper_path }}"]
 
     {%- elif requires_live_reload %} {#- elif requires_live_reload #}
 ENV HASH_TRUSS="{{ truss_hash }}"
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		supervisor==4.2.5
		# Shell script wrapper replaces supervisord for process management
Copy link Collaborator bolasim Oct 10, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Since this is the only requirement, can we remove the file and the pip install portion in the Dockerfile? michaelfeil reacted with thumbs up emoji