diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
index 1d6dbd5eeb..60a2b0ddd7 100644
--- a/.github/scripts/fbgemm_gpu_build.bash
+++ b/.github/scripts/fbgemm_gpu_build.bash
@@ -312,7 +312,15 @@ __configure_fbgemm_gpu_build_cuda () {
     elif  [[ $cuda_version_nvcc == *"V13.0"* ]] ||
           [[ $cuda_version_nvcc == *"V12.9"* ]] ||
           [[ $cuda_version_nvcc == *"V12.8"* ]]; then
-      local arch_list="8.0;9.0a;10.0a;12.0a"
+      # NOTE: If we reach this point, then we are building the package for
+      # publishing to PyPI
+      if [ "${PYPI_PUBLISH_CHANNEL:-}" = "release" ]; then
+        # FBGEMM non-nightly releases can be built with a different version of
+        # CUDA, which might result in a larger binary size than what PyPI allows.
+        local arch_list="8.0;9.0a;10.0a"
+      else
+        local arch_list="8.0;9.0a;10.0a;12.0a"
+      fi
 
     elif  [[ $cuda_version_nvcc == *"V12.6"* ]] ||
           [[ $cuda_version_nvcc == *"V12.4"* ]] ||
diff --git a/.github/workflows/fbgemm_gpu_release_cuda.yml b/.github/workflows/fbgemm_gpu_release_cuda.yml
index 5d67e84bdb..0d141a8292 100644
--- a/.github/workflows/fbgemm_gpu_release_cuda.yml
+++ b/.github/workflows/fbgemm_gpu_release_cuda.yml
@@ -65,6 +65,10 @@ jobs:
       matrix: ${{ needs.generate-build-matrix.outputs.matrix }}
       repo-ref: ${{ github.ref }}
       pytorch-channel-version: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch-channel-version) || 'release' }}
+      extra-env: >-
+        {
+          "PYPI_PUBLISH_CHANNEL": "release"
+        }
 
   generate-test-matrix:
     needs: build