diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index f8e9f53a2..954ae23ae 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -43,6 +43,9 @@ Collecting full debug bundle (optional): curl -o must-gather.sh -L https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh chmod +x must-gather.sh ./must-gather.sh + +# For extended diagnostics (includes system/PCI info): +ENABLE_EXTENDED_DIAGNOSTICS=true ./must-gather.sh ``` **NOTE**: please refer to the [must-gather](https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh) script for debug data collected. diff --git a/Makefile b/Makefile index ed3b063a1..6d7fe6808 100644 --- a/Makefile +++ b/Makefile @@ -295,6 +295,18 @@ build-image: # This includes https://github.com/openshift-psap/ci-artifacts docker-image: OUT_IMAGE ?= $(IMAGE_NAME):$(IMAGE_TAG) +##### Debug Container ##### +DEBUG_CONTAINER_IMAGE ?= ghcr.io/nvidia/gpu-operator-debug +DEBUG_CONTAINER_TAG ?= latest + +.PHONY: build-debug-container push-debug-container + +build-debug-container: + $(DOCKER) build -t $(DEBUG_CONTAINER_IMAGE):$(DEBUG_CONTAINER_TAG) hack/debug-container/ + +push-debug-container: build-debug-container + $(DOCKER) push $(DEBUG_CONTAINER_IMAGE):$(DEBUG_CONTAINER_TAG) + install-tools: @echo Installing tools from tools.go export GOBIN=$(PROJECT_DIR)/bin && \ diff --git a/hack/debug-container/Dockerfile b/hack/debug-container/Dockerfile new file mode 100644 index 000000000..945a121e3 --- /dev/null +++ b/hack/debug-container/Dockerfile @@ -0,0 +1,9 @@ +FROM ubuntu:22.04 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + dmidecode \ + pciutils \ + && rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["/bin/sh"] + diff --git a/hack/must-gather.sh b/hack/must-gather.sh index 841e81442..4e85988d9 100755 --- a/hack/must-gather.sh +++ b/hack/must-gather.sh @@ -3,6 +3,57 @@ set -o nounset set -x +# Set ENABLE_EXTENDED_DIAGNOSTICS=true to use a debug container for complete nvidia-bug-report collection +ENABLE_EXTENDED_DIAGNOSTICS=${ENABLE_EXTENDED_DIAGNOSTICS:-false} +DEBUG_CONTAINER_IMAGE=${DEBUG_CONTAINER_IMAGE:-ghcr.io/nvidia/gpu-operator-debug:latest} +DEBUG_TIMEOUT_SECONDS=${DEBUG_TIMEOUT_SECONDS:-60} + +# Noise patterns from kubectl debug output that should be filtered +KUBECTL_NOISE_PATTERN="^Targeting\|^Defaulting\|^Unable\|^warning:\|^All commands\|^If you don" + +# Filter out kubectl informational messages from output +filter_kubectl_noise() { + grep -v "${KUBECTL_NOISE_PATTERN}" || true +} + +# Append a section header to the bug report +append_section_header() { + local file="$1" + local title="$2" + + { + echo "" + echo "____________________________________________" + echo "" + echo "${title}" + echo "" + } >> "${file}" +} + +# Collect diagnostic output using debug container and append to bug report +# Args: $1=pod_name, $2=node_name, $3=command, $4=command_args, $5=output_file +collect_debug_diagnostic() { + local pod_name="$1" + local node_name="$2" + local cmd="$3" + local cmd_args="$4" + local output_file="$5" + + append_section_header "${output_file}" "${cmd} ${cmd_args} output (via must-gather extended diagnostics)" + + # Use -i to attach stdin (required to capture output) + if ! timeout "${DEBUG_TIMEOUT_SECONDS}" $K debug -n "${OPERATOR_NAMESPACE}" "${pod_name}" \ + --image="${DEBUG_CONTAINER_IMAGE}" \ + --target=nvidia-driver-ctr \ + --profile=sysadmin \ + -i \ + -- ${cmd} ${cmd_args} 2>/dev/null | filter_kubectl_noise >> "${output_file}"; then + echo "Warning: Failed to collect ${cmd} from ${node_name} (timed out or failed)" >&2 + echo "(collection failed or timed out after ${DEBUG_TIMEOUT_SECONDS}s)" >> "${output_file}" + fi +} + + K=kubectl if ! $K version > /dev/null; then K=oc @@ -262,18 +313,69 @@ echo "# nvidia-bug-report.sh" echo "#" echo "" +if [[ "${ENABLE_EXTENDED_DIAGNOSTICS}" == "true" ]]; then + echo "===============================================================================" + echo "WARNING: Extended diagnostics enabled." + echo "" + echo "This will pull and run an external debug container (${DEBUG_CONTAINER_IMAGE})" + echo "with privileged access to collect system information (dmidecode, lspci)." + echo "" + echo "By enabling this option, you acknowledge:" + echo " - An external container image will be pulled and executed in your cluster" + echo " - The debug container requires privileged access (sysadmin profile)" + echo " - System hardware information will be collected and included in the bug report" + echo "" + echo "To disable, unset ENABLE_EXTENDED_DIAGNOSTICS or set it to false." + echo "===============================================================================" + echo "" +fi + for pod in $($K get pods -lopenshift.driver-toolkit -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-driver-daemonset -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n "${OPERATOR_NAMESPACE}"); do pod_nodename=$($K get "${pod}" -ojsonpath={.spec.nodeName} -n "${OPERATOR_NAMESPACE}") + pod_name=$(basename "${pod}") echo "Saving nvidia-bug-report from ${pod_nodename} ..." - $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2 || \ - (echo "Failed to collect nvidia-bug-report from ${pod_nodename}" && continue) + # Collect standard nvidia-bug-report from driver container + if ! $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2; then + echo "Failed to collect nvidia-bug-report from ${pod_nodename}" + continue + fi + + # Clean up any existing temp file to avoid permission issues + rm -f /tmp/nvidia-bug-report.log.gz + + if ! $K cp "${OPERATOR_NAMESPACE}"/"${pod_name}":/tmp/nvidia-bug-report.log.gz /tmp/nvidia-bug-report.log.gz 2>/dev/null; then + echo "Failed to save nvidia-bug-report from ${pod_nodename}" + continue + fi - $K cp "${OPERATOR_NAMESPACE}"/$(basename "${pod}"):/tmp/nvidia-bug-report.log.gz /tmp/nvidia-bug-report.log.gz || \ - (echo "Failed to save nvidia-bug-report from ${pod_nodename}" && continue) mv /tmp/nvidia-bug-report.log.gz "${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log.gz" + + if [[ "${ENABLE_EXTENDED_DIAGNOSTICS}" == "true" ]]; then + echo "Collecting extended diagnostics (dmidecode/lspci) from ${pod_nodename}..." + + bug_report_file="${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log" + + # Decompress the bug report to append data + if ! gunzip "${bug_report_file}.gz" 2>&1; then + echo "Warning: Failed to decompress bug report for ${pod_nodename}, skipping extended diagnostics" + continue + fi + + append_section_header "${bug_report_file}" "*** EXTENDED DIAGNOSTICS (from debug container) ***" + + collect_debug_diagnostic "${pod_name}" "${pod_nodename}" "dmidecode" "" "${bug_report_file}" + collect_debug_diagnostic "${pod_name}" "${pod_nodename}" "lspci" "-vvv" "${bug_report_file}" + + # Recompress the bug report + if ! gzip "${bug_report_file}" 2>&1; then + echo "Warning: Failed to recompress bug report for ${pod_nodename}" + fi + else + echo "NOTE: For extended diagnostics (dmidecode/lspci), set ENABLE_EXTENDED_DIAGNOSTICS=true" + fi done echo ""