diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }

+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
index 936ef7a7ff..8893f065ea 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_%(model_name)s.h"
+#include "processConfig.h"

 #include

@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;

   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;

 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
index 22acd3abe9..b13b6e9edc 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
+++
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h new file mode 100644 index 0000000000..a4777347d0 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
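+
+// This header centralizes compile-time process metadata needed outside of CPPProcess.h:
+// currently only the number of Feynman diagrams, which is used (via MemoryBuffers.h and
+// the matrix element kernels) to size the per-diagram numerator buffers as nevt * ndiagrams.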
+ + +#ifndef MG5_CONFIG_%(processid_uppercase)s_H +#define MG5_CONFIG_%(processid_uppercase)s_H 1 + +namespace processConfig { + + constexpr int ndiagrams = %(ndiagrams)d; + +} + +#endif // MG5_CONFIG_%(processid_uppercase)s_H \ No newline at end of file diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 4c35c3eec6..d742565283 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -25,6 +25,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 0665bfb93b..dccdf2e736 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -16,6 +16,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -384,9 +447,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -462,8 +524,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -525,25 +586,35 @@
                    fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                    fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                   bool storeChannelWeights,          // if true, compute final multichannel weights
+                   bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
                    const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+        totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: keep summing the denominators over helicities (the total is used below)
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag <
processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -588,16 +659,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -665,6 +764,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -674,6 +774,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
              fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+             unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+             bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
              fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
              fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -685,8 +787,10 @@
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
              int* allselcol, // output: helicity selection[nevt]
-             fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-             fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+             fptype* allNumerators, // output: multichannel numerators[nevt * ndiagrams], running_sum_over_helicities
+             fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+             unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+             bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
              const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -714,7 +818,6 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc
index 7de8886b1d..90ffe70624 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc
@@ -86,6 +86,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
              const fptype* allrndcol, // input: random numbers[nevt] for color selection
              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+             const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
              fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
              int* allselhel, // output: helicity selection[nevt]
@@ -94,6 +95,8 @@
              fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
              fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
              fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+             unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+             bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
              fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
              fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -110,6 +113,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
              const fptype* allrndcol, // input: random numbers[nevt] for color selection
              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+             const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
              fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
              int* allselhel, // output: helicity selection[nevt]
@@ -117,6 +121,8 @@ namespace mg5amcCpu
              int* allselcol, // output: helicity selection[nevt]
              fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
              fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+             unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+             bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
              const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc
index 4372edde52..aa7ad1165e 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc
@@ -17,7 +17,7 @@
   gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-  gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+  gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
   gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
   gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -31,11 +31,14 @@
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -47,6 +50,7 @@
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+  // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -55,9 +59,10 @@
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-
fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -68,9 +73,15 @@ // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -108,40 +119,6 @@ const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -156,7 +133,8 @@ cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -204,82 +182,97 @@ } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV %% neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV %% neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - 
//printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV %% neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%%d icol=%%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -304,13 +297,7 @@ // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -318,13 +305,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cmath> // sqrt, M_PI
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool is_good_hel[CPPProcess::ncomb];
+    sigmaKin_getGoodHel(
+      momenta, couplings, matrix_elements, numerators, denominators,
+#ifdef MGONGPUCPP_GPUIMPL
+      color_jamps,
+#endif
+      is_good_hel,
+      count );
+    sigmaKin_setGoodHel( is_good_hel );
+    return nullptr;
+  }
+
+  void initialize(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    // static local initialization is called exactly once in a thread-safe way
+    static void* dummy = initialize_impl( momenta, couplings, matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+                                          color_jamps,
+#endif
+                                          numerators,
+                                          denominators,
+                                          count );
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__
+#endif
+    void
+    transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride )
+  {
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    std::size_t i_page = i_event / page_size;
+    std::size_t i_vector = i_event % page_size;
+
+    for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part )
+    {
+      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
+      {
+        momenta_out[i_page * CPPProcess::npar * 4 * page_size + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event];
+      }
+    }
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+
+  __global__ void copy_inputs(
+    const double* momenta_in,
+    const double* helicity_random_in,
+    const double* color_random_in,
+    const double* diagram_random_in,
+    const double* alpha_s_in,
+    fptype* momenta,
+    fptype* helicity_random,
+    fptype* color_random,
+    fptype* diagram_random,
+    fptype* g_s,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
+    helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
+    color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
+    g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to twice the SIMD page size (in mixed floating-point
+    // precision, sigmaKin processes two neppV event pages at a time, see #924)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
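+      // missing optional inputs fall back to defaults: 0.5 for the helicity/color/diagram
+      // selection random numbers, and g_s = sqrt( 4 * pi * 0.118 ) ~= 1.2177 (i.e. alpha_s = 0.118)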
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
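+   * For example, a caller built against interface version 1.0 remains binary
+   * compatible with an implementation reporting 1.3, but not with one reporting 2.0.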
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves global metadata about the matrix element implementation, e.g. the
+   * device it targets or its particle, diagram, helicity and color counts.
+   *
+   * @param meta_key
+   *    key of the metadata entry to retrieve
+   * @param result
+   *    pointer to the memory receiving the value; its type depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @param param_card_path
+   *    path to the parameter file
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
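+   *
+   * A minimal call sketch (hypothetical buffer names; `momenta`, `alpha_s` and `m2`
+   * are caller-allocated arrays, with `count` events stored contiguously, i.e.
+   * stride == count and offset == 0):
+   *
+   *   UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
+   *   void const* inputs[] = { momenta, alpha_s };
+   *   UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* outputs[] = { m2 };
+   *   UmamiStatus status = umami_matrix_element(
+   *     handle, count, count, 0, 2, in_keys, inputs, 1, out_keys, outputs );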
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py
index 3f8a85afa6..ef2ea6baf3 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py
@@ -1323,7 +1323,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -1332,7 +1332,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
 #else
     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -1451,6 +1451,7 @@ def generate_process_files(self):
         self.edit_check_sa()
         self.edit_mgonGPU()
         self.edit_processidfile() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses)
+        self.edit_processConfig() # sub process specific, not to be symlinked from the Subprocesses directory
         self.edit_colorsum() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses)
         self.edit_testxxx() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific)
         self.edit_memorybuffers() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific)
@@ -1543,6 +1544,17 @@ def edit_colorsum(self):
         ff = open(pjoin(self.path, 'color_sum.cc'),'w')
         ff.write(template % replace_dict)
         ff.close()
+
+    def edit_processConfig(self):
+        """Generate processConfig.h"""
+        ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_processConfig')
+        template = open(pjoin(self.template_path,'gpu','processConfig.h'),'r').read()
+        replace_dict = {}
+        replace_dict['ndiagrams'] = len(self.matrix_elements[0].get('diagrams'))
+        replace_dict['processid_uppercase'] = self.get_process_name().upper()
+        ff = open(pjoin(self.path, 'processConfig.h'),'w')
+        ff.write(template % replace_dict)
+        ff.close()
 
     def generate_subprocess_directory_end(self, **opt):
         """ opt contain all local variable of the fortran original function"""
@@ -1926,7 +1938,8 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
         const fptype* COUPs[nxcoup];
         for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        fptype* numerators = allNumerators;
+        const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+        fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
         fptype* denominators = allDenominators;
 #endif
 #else
@@ -1939,7 +1952,7 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
         for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
           COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -1948,12 +1961,8 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
         for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-        // SCALAR channelId for the current event (CUDA)
-        unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
         // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif""")
         diagrams = matrix_element.get('diagrams')
@@ -1985,8 +1994,12 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
                 ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diag_to_config[id_amp]) # BUG #472
                 ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % id_amp) # wrong fix for BUG #472
                 res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL")
-                res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diagram.get('number'))
-                res.append("if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );")
+                diagnum = diagram.get('number')
+                res.append("if( storeChannelWeights )")
+                res.append("{")
+                res.append(" numerators_sv[%i] += cxabs2( amp_sv[0] );" % (diagnum-1)) +
res.append(" denominators_sv += cxabs2( amp_sv[0] );") + res.append("}") res.append("#endif") else: res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index e54290d5a7..6669d53123 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -123,6 +123,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'gpu/testmisc.cc', s+'gpu/testxxx_cc_ref.txt', s+'gpu/valgrind.h', s+'gpu/perf.py', s+'gpu/profile.sh', s+'gpu/cudacpp_overlay.mk', s+'gpu/makefile_wrapper.mk', + s+'gpu/umami.h', s+'gpu/umami.cc', s+'CMake/SubProcesses/CMakeLists.txt'], 'test': [s+'gpu/cudacpp_test.mk']} @@ -151,6 +152,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'testxxx.cc', # this is generated from a template in Subprocesses but we still link it in P1 'MemoryBuffers.h', # this is generated from a template in Subprocesses but we still link it in P1 'MemoryAccessCouplings.h', # this is generated from a template in Subprocesses but we still link it in P1 + 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] # AV - use template files from PLUGINDIR instead of MG5DIR and change their names @@ -264,7 +266,7 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): done""" try: result = subprocess.run( - ["bash", "-lc", patch_coupl_write], + ["bash", "-c", patch_coupl_write], cwd=pjoin(self.dir_path, "Source", "MODEL"), text=True, capture_output=True, diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index b7cdf09c17..cac2fc9257 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004445075988769531  +DEBUG: model prefixing takes 0.0017848014831542969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -149,21 +150,21 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.007 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -175,22 +176,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.060 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.006 s +Wrote files for 8 helas calls in 1.876 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.170 s +ALOHA: aloha creates 3 routines in 0.129 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.184 s +ALOHA: aloha creates 7 routines in 0.131 s FFV1 FFV1 FFV2 @@ -199,32 +200,34 @@ ALOHA: aloha creates 7 routines in 0.184 s FFV4 FFV2_4 FFV2_4 
-FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m2.396s -user 0m1.798s -sys 0m0.425s -Code generation completed in 2 seconds +real 0m9.417s +user 0m1.278s +sys 0m0.670s +Code generation completed in 10 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -245,12 +248,14 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run +/shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/madgraph/various/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ quit INFO: launch in debug mode @@ -274,9 +279,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
 
 #include
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;
 
   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
index 2450ec54f8..a536b3d076 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+ const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu-
 constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -239,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -248,7 +312,7 @@ namespace mg5amcCpu
 #else
     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -344,7 +408,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -357,7 +422,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -366,12 +431,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -391,8 +452,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= amp_sv[0];
@@ -404,8 +468,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 2
      FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= amp_sv[0];
@@ -719,9 +786,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -797,8 +863,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
#endif /* clang-format on */ @@ -860,25 +925,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -923,16 +998,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1000,6 +1103,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1009,6 +1113,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1020,8 +1126,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1049,7 +1157,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1065,7 +1172,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1079,11 +1186,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1095,6 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1103,9 +1214,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1116,9 +1228,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1156,40 +1274,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1204,7 +1288,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1252,82 +1337,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1352,13 +1452,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1366,13 +1460,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
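/* each event record now spans processConfig::ndiagrams numerator slots (one per diagram), hence the record offset is scaled by ndiagrams */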
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index c9d280d0f6..3ca62dbc6e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: selected diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: selected diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/processConfig.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/processConfig.h new file mode 100644 index 0000000000..65cfee8266 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/processConfig.h @@ -0,0 +1,16 
@@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H +#define MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool 
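/* one flag per helicity combination */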
is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
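/* g_s = sqrt( 4*pi*alpha_s ); the hardcoded fallback corresponds to alpha_s ~ 0.118 */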
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
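+ // Map the untyped UMAMI input buffers onto typed pointers: the element type of each buffer is fixed by its key (double for momenta, alpha_s and the random numbers, int for the index inputs); unknown keys are rejected with UMAMI_ERROR_UNSUPPORTED_INPUT.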
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
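/* note: all scratch buffers are padded to rounded_count (a whole number of thread blocks) and are allocated stream-ordered on gpu_stream; they are released below via gpuFreeAsync on the same stream */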
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to twice the SIMD page size (presumably because mixed-precision SIMD builds process two neppV pages at a time, see #924) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Retrieves global metadata about the matrix element implementation, such as the + * device it runs on or the number of particles, diagrams and helicity combinations + * of the process. + * + * @param meta_key + * key selecting the metadata entry to query + * @param result + * pointer to caller-allocated storage for the value; its type depends on the + * key (int for the count keys, UmamiDevice for UMAMI_META_DEVICE) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
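+ * + * A minimal caller-side sketch (illustrative only; n_events and the caller-allocated + * momenta and m2 arrays are hypothetical, with momenta laid out as described for the + * stride parameter below): + * + * UmamiHandle h; + * umami_initialize( &h, "param_card.dat" ); + * UmamiInputKey ikey = UMAMI_IN_MOMENTA; + * void const* in = momenta; // double array of 4 * n_particles * n_events values + * UmamiOutputKey okey = UMAMI_OUT_MATRIX_ELEMENT; + * void* out = m2; // double array of n_events values + * umami_matrix_element( h, n_events, n_events, 0, 1, &ikey, &in, 1, &okey, &out ); + * umami_free( h );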
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 3c991f09cf..6cd2239516 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -1,8 +1,8 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 1.185530662536621) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT ************************************************************ * * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004302024841308594  +DEBUG: model prefixing takes 0.001810312271118164  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,32 +150,32 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.006 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +Generated helas calls for 1 subprocesses (2 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.190 s +ALOHA: aloha creates 4 routines in 0.122 s FFV1 FFV1 FFV2 @@ -184,17 +184,17 @@ ALOHA: aloha creates 4 routines in 0.190 s FFV4 FFV2_4 FFV2_4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m1.709s -user 0m1.562s -sys 0m0.115s -Code generation completed in 2 seconds +real 0m2.084s +user 0m0.455s +sys 0m0.171s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include <memory> @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index af61f3ea74..ec11e4c04b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds 
+#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, update the multichannel numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt*ndiagrams], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, update the multichannel numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt*ndiagrams], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -717,9 +778,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings,
allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -795,8 +855,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -858,25 +917,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += ghelAllDenominators[ievt + ighel * nevt]; // also sum the per-helicity denominators into the "helicity #0" buffer + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -921,16 +990,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random
numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -998,6 +1095,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1007,6 +1105,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1018,8
+1118,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1047,7 +1149,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1063,7 +1164,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1077,11 +1178,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1093,6 +1197,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1101,9 +1206,10 @@ const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps,
allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1114,9 +1220,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1154,40 +1266,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1202,7 +1280,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1250,82 +1329,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1350,13 +1444,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1364,13 +1452,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index c9d280d0f6..3ca62dbc6e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/processConfig.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/processConfig.h new file mode 100644 index 0000000000..65cfee8266 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin.
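+// NB (editorial sketch, names as used in this patch): processConfig::ndiagrams sizes the per-event numerator buffers (numerators[nevt*ndiagrams], one entry per diagram per event) that drive the multichannel weights and the event-by-event diagram sampling; e.g. in the CUDA layout the numerator of diagram idiag for event ievt is read as allNumerators[ievt * processConfig::ndiagrams + idiag], while the C++ SIMD layout interleaves the neppV events of a page within each diagram.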
+ + +#ifndef MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H +#define MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cmath> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel
); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default: g_s corresponding to alpha_s = 0.118 + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
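+ // Each UmamiInputKey determines the pointee type of the matching type-erased entry in 'inputs' (double arrays for momenta, alpha_s and the random numbers; int arrays for flavor and diagram indices); keys this process cannot honour return UMAMI_ERROR_UNSUPPORTED_INPUT so callers can detect the capability gap.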
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to a double SIMD page (2 * neppM events): sigmaKin processes two neppV pages at a time in mixed-precision builds + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default: g_s corresponding to alpha_s = 0.118 + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Retrieves metadata about the compiled process, such as the device it runs on + * or its particle, diagram and helicity counts. + * + * @param meta_key + * key of the metadata entry to query + * @param result + * pointer to caller-allocated memory receiving the value; its type depends + * on the key (e.g. UmamiDevice for UMAMI_META_DEVICE, int for the counts) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
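+ *
+ * In this implementation the momenta are read in an SoA layout (see the
+ * transpose_momenta helper in umami.cc): component i_mom of particle i_part for
+ * event i_event is taken from
+ * momenta[offset + stride * ( n_particles * i_mom + i_part ) + i_event].
+ *
+ * A minimal, illustrative call sequence for one event of a four-particle process
+ * (error handling omitted; the param_card.dat path is an assumption of this example):
+ *
+ *   UmamiHandle h;
+ *   umami_initialize( &h, "param_card.dat" );
+ *   double mom[16] = { 0 }; // 4 components x 4 particles; count=1, stride=1, offset=0
+ *   double m2[1];
+ *   UmamiInputKey ikeys[1] = { UMAMI_IN_MOMENTA };
+ *   void const* ins[1] = { mom };
+ *   UmamiOutputKey okeys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
+ *   void* outs[1] = { m2 };
+ *   umami_matrix_element( h, 1, 1, 0, 1, ikeys, ins, 1, okeys, outs );
+ *   umami_free( h );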
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 156f7ce8e7..b485abf77b 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004584789276123047  +DEBUG: model prefixing takes 0.0018193721771240234  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.007 s +1 processes with 3 diagrams generated in 0.015 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -176,49 +177,51 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  Generated helas calls for 1 subprocesses (3 diagrams) in 0.009 s -Wrote files for 10 helas calls in 0.078 s +Wrote files for 10 helas calls in 1.922 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.103 s +ALOHA: aloha creates 2 routines in 0.102 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.088 s +ALOHA: aloha creates 4 routines in 0.069 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m2.028s -user 0m1.664s -sys 0m0.358s -Code generation completed in 2 seconds +real 0m8.946s +user 0m1.272s +sys 0m0.626s +Code generation completed in 9 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -239,9 +242,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -268,9 +271,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
- sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+ sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
 assert( useChannelIds == false );
 sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
 }
+ // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+ // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+ static __host__ __device__ inline fptype_sv*
+ kernelAccessP( fptype* buffer )
+ {
+ return reinterpret_cast<fptype_sv*>( buffer );
+ }
+
 // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
 // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
 static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
+
 #include 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
 typedef BufferBase<fptype> BufferNumerators;
 // The size (number of elements) per event in a memory buffer for numerators
- constexpr size_t sizePerEventNumerators = 1;
+ // (should be equal to the number of diagrams in the process)
+ constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 #ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
index 55167ebaf3..4204e595d8 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+ ,
+ const int ievt00,
+ bool sanityCheckMixedPrecision = true
+#endif
+ )
+ {
+ unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+ using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current event (CUDA)
+ // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+ if( allChannelIds != nullptr )
+ {
+ const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+ const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+ // NB: channelIds_sv is a scalar in CUDA
+ channelId = channelIds_sv;
+ assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+ }
+#else // C++
+ using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+ // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+ // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+ if( allChannelIds != nullptr )
+ {
+ // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+ const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+ uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+ // NB: channelIds_sv is a scalar in no-SIMD C++
+ channelId = channelIds_sv;
+#else
+ // NB: channelIds_sv is a vector in SIMD C++
+ channelId = channelIds_sv[0]; // element[0]
+ for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+ {
+ assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+ }
+#endif
+ assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+ if( sanityCheckMixedPrecision )
+ {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+ const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+ uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+ // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+ for( int i = 0; i < neppV; ++i )
+ {
+ assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+ }
+#endif
+ }
+ }
+#endif // MGONGPUCPP_GPUIMPL
+ return channelId;
+ }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
 constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
 constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g.
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +452,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +469,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -732,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -810,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
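+ // (in the non-multichannel build there are no numerator/denominator buffers to fill during helicity filtering)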
 #endif /* clang-format on */
@@ -873,25 +941,35 @@ namespace mg5amcCpu
 fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ bool storeChannelWeights, // if true, compute final multichannel weights
+ bool mulChannelWeight, // if true, multiply matrix element by channel weight
 #endif
 const fptype globaldenom ) /* clang-format on */
 {
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
 allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const int nevt = gridDim.x * blockDim.x;
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
 fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
 fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
 for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
 {
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- totAllNumerators[ievt] += hAllNumerators[ievt];
 totAllDenominators[ievt] += hAllDenominators[ievt];
+ fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+ fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+ for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+ {
+ firstNumerator[idiag] += hAllNumerators[idiag];
+ }
+ }
+ if( mulChannelWeight )
+ {
+ unsigned int channelId = allChannelIds[ievt];
+ allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
 }
- allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
 }
 #endif
 return;
@@ -936,16 +1014,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 __global__ void
- select_col( int* allselcol, // output: color selection[nevt]
- const fptype* allrndcol, // input: random numbers[nevt] for color selection
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
- const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
- const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ select_col_and_diag( int* allselcol, // output: color selection[nevt]
+ unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+ const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+ const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+ const fptype* allNumerators, // input: multichannel numerators[nevt*ndiagrams], summed over helicities
+ const fptype* allDenominators, // input: multichannel denominators[nevt], summed over helicities
+ const int nevt ) // input: #events (for cuda: nevt == ndim ==
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1013,6 +1119,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1022,6 +1129,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1033,8 +1142,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1062,7 +1173,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1078,7 +1188,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1092,11 +1202,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1108,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1116,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1129,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1169,40 +1290,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1217,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1265,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1365,13 +1468,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1379,13 +1476,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 26652fc739..5fdf36bb26 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. 
+// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef 
MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to a multiple of two SIMD pages (mixed-precision SIMD builds process two neppV pages at a time) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries a global metadata entry of this matrix element implementation, e.g. the + * device it targets or its particle, diagram and helicity counts. + * + * @param meta_key + * key selecting the metadata entry to query + * @param result + * pointer to caller-allocated memory that receives the value; its type depends + * on the key (UmamiDevice for UMAMI_META_DEVICE, int for the count keys) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 1f90d3c408..d5886a1099 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004430294036865234  +DEBUG: model prefixing takes 0.0018737316131591797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,45 +151,45 @@ INFO: Please specify coupling orders to bypass this step. 
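For reference, a minimal UMAMI client might look as follows. This is an illustrative sketch, not code from this patch: the param_card path and batch size are invented, only momenta and alpha_s are passed (momenta is the one required input; random numbers omitted here default to 0.5 inside umami.cc), and the momenta layout follows transpose_momenta(), i.e. momenta[(npar * i_mom + i_part) * stride + i_event] with npar = 4 for g g > t t~.

#include "umami.h"

#include <cstdio>

int main()
{
  UmamiHandle handle;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; // hypothetical path

  int ndiagrams = 0;
  umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiagrams ); // 3 for this process (processConfig::ndiagrams)

  constexpr size_t count = 16;        // events per batch (illustrative)
  double momenta[4 * 4 * count] = {}; // fill with real phase-space points before calling
  double alpha_s[count];
  for( size_t i = 0; i < count; ++i ) alpha_s[i] = 0.118;
  double m2[count];

  UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
  const void* in_ptrs[] = { momenta, alpha_s };
  UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* out_ptrs[] = { m2 };

  // stride == count and offset == 0: evaluate one contiguous batch
  if( umami_matrix_element( handle, count, count, 0, 2, in_keys, in_ptrs, 1, out_keys, out_ptrs ) == UMAMI_SUCCESS )
    std::printf( "ME[0] = %g\n", m2[0] );
  return umami_free( handle ) == UMAMI_SUCCESS ? 0 : 1;
}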
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.006 s +1 processes with 3 diagrams generated in 0.011 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
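The event-by-event diagram choice that this patch introduces (select_col_and_diag in the GPU code below, and the matching loop in the C++ sigmaKin) samples channel c with probability N_c / sum_k N_k from the per-diagram numerators. A standalone sketch of that selection logic, with hypothetical names and with the channel2iconfig masking of invalid channels omitted:

#include <cassert>
#include <vector>

// Illustration only: pick a 1-based channelId with probability N_c / sum_k N_k,
// given one event's per-diagram numerators and a uniform random number r in [0,1).
unsigned int sampleDiagram( const std::vector<double>& numerators, double r )
{
  double normalization = 0.;
  for( double n : numerators ) normalization += n;
  assert( normalization > 0. );
  double cumulative = 0.;
  for( unsigned int ichan = 0; ichan < numerators.size(); ++ichan )
  {
    cumulative += numerators[ichan];
    if( r < cumulative / normalization ) return ichan + 1;
  }
  return (unsigned int)numerators.size(); // guard against r ~ 1 with rounding
}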
-Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.121 s +ALOHA: aloha creates 2 routines in 0.065 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
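The gpuMallocAsync/gpuFreeAsync/gpuStreamSynchronize macros (added to GpuAbstraction.h below and used throughout umami.cc) wrap the stream-ordered allocators of CUDA and HIP behind one portable name. A minimal sketch of the usage pattern they enable, with a hypothetical kernel and assuming GpuRuntime.h pulls in the abstraction macros and checkGpu:

#include "GpuRuntime.h" // assumption: provides the gpu* macros and checkGpu

__global__ void scale( double* buf, double factor, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) buf[i] *= factor;
}

void scaleOnStream( gpuStream_t stream, double factor, int n )
{
  double* buf = nullptr;
  gpuMallocAsync( &buf, n * sizeof( double ), stream ); // stream-ordered: no device-wide sync
  scale<<<( n + 255 ) / 256, 256, 0, stream>>>( buf, factor, n );
  gpuFreeAsync( buf, stream );    // the free is queued on the same stream
  gpuStreamSynchronize( stream ); // block the host only on this stream
}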
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.508s -user 0m0.439s -sys 0m0.064s -Code generation completed in 1 seconds +real 0m1.735s +user 0m0.348s +sys 0m0.112s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 2b06bb84d0..bbc2c6c17c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00,
+ bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -729,9 +790,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, 
allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -807,8 +867,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -870,25 +929,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -933,16 +1002,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random 
numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1010,6 +1107,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1019,6 +1117,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1030,8 
+1130,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1059,7 +1161,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1075,7 +1176,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1089,11 +1190,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1105,6 +1209,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1113,9 +1218,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, 
allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1126,9 +1232,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1166,40 +1278,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1214,7 +1292,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1262,82 +1341,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1362,13 +1456,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1376,13 +1464,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 26652fc739..5fdf36bb26 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, 
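The two selection loops in this region, diagram choice from the per-diagram numerators and color choice from the cumulative targetamp array, are both instances of the same inverse-CDF sampling trick: accumulate unnormalized weights and return the first bin whose normalized running sum exceeds one uniform random number. A scalar sketch with illustrative names (the production code works on SIMD pages and stores the result 1-based):

#include <cassert>
#include <cstddef>

// Pick an index in [0,n) with probability weights[i] / sum(weights).
int sample_index( const double* weights, std::size_t n, double r )
{
  double norm = 0.;
  for( std::size_t i = 0; i < n; ++i ) norm += weights[i];
  assert( norm > 0. ); // mirrors the FPE guard: no choice is made when all weights vanish
  double cumulative = 0.;
  for( std::size_t i = 0; i < n; ++i )
  {
    cumulative += weights[i];
    if( r < cumulative / norm ) return (int)i; // first bin whose CDF passes r
  }
  return (int)n - 1; // rounding guard for r very close to 1
}

In the diagram loop the weights are the per-diagram numerators, skipping channels whose channel2iconfig entry is -1; in the color loop they are the jamp2 sums restricted to the colors enabled in icolamp for the selected iconfig.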
// input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/processConfig.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, 
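An aside on the initialize()/initialize_impl() pair whose definition begins just above: it relies on the C++11 guarantee that a function-local static is initialized exactly once, even under concurrent calls, so the helicity filtering runs on the first call and every later call is a no-op; the void* returned by initialize_impl exists only so it can initialize that static. A generic sketch of the idiom with hypothetical names:

#include <cstdio>

static int expensive_setup()
{
  std::printf( "runs exactly once\n" );
  return 0;
}

void ensure_initialized()
{
  static int dummy = expensive_setup(); // thread-safe one-time init ("magic statics", C++11)
  (void)dummy;                          // silence unused-variable warnings
}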
+ const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
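An aside on transpose_momenta above: it converts the caller's layout, momenta_in[stride * (npar * i_mom + i_part) + i_event], into the internal AOSOA layout, in which events are grouped into pages of neppM events and each momentum component of each particle is contiguous within a page. The index arithmetic, isolated into a helper with assumed names but the same formula as the kernel:

#include <cstddef>

std::size_t aosoa_index( std::size_t i_event, std::size_t i_part, std::size_t i_mom,
                         std::size_t npar, std::size_t neppM )
{
  const std::size_t i_page = i_event / neppM; // which page of neppM events
  const std::size_t i_vec = i_event % neppM;  // slot inside the page
  return i_page * npar * 4 * neppM + i_part * 4 * neppM + i_mom * neppM + i_vec;
}

For example, with neppM = 4 and npar = 4, event 5 / particle 2 / component 0 lands at 1*64 + 2*16 + 0 + 1 = 97.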
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + }
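All of the device scratch buffers in this function go through the new gpuMallocAsync/gpuFreeAsync wrappers, i.e. stream-ordered allocation (cudaMallocAsync and friends in CUDA 11.2+, or the HIP equivalents): a buffer becomes usable when the stream reaches the allocation and is recycled when the stream reaches the free, so consecutive batches need no device-wide synchronization. A minimal CUDA sketch of the pattern:

#include <cstddef>
#include <cuda_runtime.h>

void scratch_on_stream( cudaStream_t s, std::size_t n )
{
  float* buf = nullptr;
  cudaMallocAsync( (void**)&buf, n * sizeof( float ), s ); // allocation is ordered on s
  // ... enqueue kernels that use buf on stream s ...
  cudaFreeAsync( buf, s );    // free is ordered after those kernels on s
  cudaStreamSynchronize( s ); // only required before the host inspects results
}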
+ computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. + */
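The CPU readout in the loop above undoes the SIMD layout of the per-diagram numerators: within each page of page_size events, the page_size values of one diagram are contiguous, and the ndiagrams blocks of a page follow one another. A scalar helper with the same index formula (illustrative names; the GPU path instead stores them event-major, numerators[i_event * ndiagrams + i_diag], as in copy_outputs):

#include <cstddef>

double diagram_weight( const double* numerators, const double* denominators,
                       std::size_t i_event, std::size_t i_diag,
                       std::size_t ndiagrams, std::size_t page_size )
{
  const std::size_t i_page = i_event / page_size; // SIMD page of this event
  const std::size_t i_vec = i_event % page_size;  // slot inside the page
  return numerators[i_page * page_size * ndiagrams + i_diag * page_size + i_vec] / denominators[i_event];
}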
+ const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries a global property of the matrix element implementation. Does not + * require an instance handle. + * + * @param meta_key + * key identifying the requested metadata item + * @param result + * pointer to caller-owned memory receiving the value; its type depends on + * the key (UmamiDevice for UMAMI_META_DEVICE, int for the count keys) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 0af9646028..c6e2bc2275 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -1,8 +1,8 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 0.5061478614807129) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT ************************************************************ * * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.01866316795349121  +DEBUG: model prefixing takes 0.00186920166015625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. 
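To make the calling convention of the umami.h interface above concrete, here is a minimal hypothetical caller: one event, only the matrix element requested, a made-up parameter card path, and error handling elided. The momenta array follows the component-major layout expected by umami_matrix_element (for this gg to ttbar process npar = 4, so 16 doubles per event):

#include "umami.h"

#include <stdio.h>

int main( void )
{
  UmamiHandle h = NULL;
  if( umami_initialize( &h, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; /* hypothetical card path */

  double momenta[16] = { 0. }; /* E for all 4 partons, then px, py, pz: fill with a physical phase-space point */
  double me2[1] = { 0. };
  UmamiInputKey ikeys[1] = { UMAMI_IN_MOMENTA };
  const void* ins[1] = { momenta };
  UmamiOutputKey okeys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outs[1] = { me2 };

  umami_matrix_element( h, 1, 1, 0, 1, ikeys, ins, 1, okeys, outs ); /* count=1, stride=1, offset=0 */
  printf( "|M|^2 = %g\n", me2[0] );
  return umami_free( h ) == UMAMI_SUCCESS ? 0 : 1;
}

Additional inputs (alpha_s, the random numbers for helicity/color/diagram choice) and outputs (selected indices, per-diagram weights) are requested the same way, by growing the key and pointer arrays in lockstep.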
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.010 s +1 processes with 3 diagrams generated in 0.012 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -159,21 +159,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.025 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -187,9 +187,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py 
at line 1589]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -198,25 +198,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.088 s -Wrote files for 46 helas calls in 0.403 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.023 s +Wrote files for 46 helas calls in 4.525 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.419 s +ALOHA: aloha creates 5 routines in 0.191 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.553 s +ALOHA: aloha creates 10 routines in 0.166 s VVV1 VVV1 FFV1 @@ -226,32 +226,34 @@ ALOHA: aloha creates 10 routines in 0.553 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.986s -user 0m4.846s -sys 0m0.948s -Code generation completed in 6 seconds +real 0m12.237s +user 0m1.681s +sys 0m0.791s +Code generation completed in 12 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -272,9 +274,9 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -301,9 +303,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 55167ebaf3..4204e595d8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const 
int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +452,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +469,11 @@ 
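Note: the new per-diagram numerator plumbing rests on two pieces visible above, a pointer-returning accessor and a per-event stride of processConfig::ndiagrams. A minimal sketch follows, assuming the reinterpret_cast targets fptype_sv* exactly as in the sibling kernelAccess helpers and that the same accessor pattern is shared by the numerator access classes; the snippet is illustrative, not an extra change in the patch.

    // Sketch: kernelAccessP returns a pointer, so numerators_sv[idiag]
    // addresses the (scalar or SIMD) numerator slot of diagram idiag.
    static __host__ __device__ inline fptype_sv*
    kernelAccessP( fptype* buffer )
    {
      return reinterpret_cast<fptype_sv*>( buffer );
    }

    // GPU side: each thread (= one event) owns ndiagrams consecutive slots.
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
    fptype_sv* numerators_sv = kernelAccessP( numerators ); // numerators_sv[0..ndiagrams-1]
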
namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -732,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -810,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
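Note: every generated diagram block now follows one pattern: the old data-dependent branches on channelId become a single unconditional per-diagram store, which is what makes the a-posteriori channel choice possible. Illustrative pseudo-snippet, with idiag the zero-based diagram index:

    // Before: only the preselected channel accumulated its numerator
    //   if( channelId == idiag + 1 ) numerators_sv += cxabs2( amp_sv[0] );
    //   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
    // After: keep all per-diagram numerators; the denominator stays one running sum
    if( storeChannelWeights )
    {
      numerators_sv[idiag] += cxabs2( amp_sv[0] );
      denominators_sv += cxabs2( amp_sv[0] );
    }
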
#endif /* clang-format on */ @@ -873,25 +941,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -936,16 +1014,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1013,6 +1119,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1022,6 +1129,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1033,8 +1142,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1062,7 +1173,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1078,7 +1188,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1092,11 +1202,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1108,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1116,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1129,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1169,40 +1290,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
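Note: the reworked normalise_output reuses the "helicity #0" slice of the numerator super-buffer as the accumulator for the sum over good helicities, with an inner loop over diagrams; the single-diagram reweighting of the ME is then a lookup into that slice. A condensed sketch of the indexing, assuming the helicity-major [ighel][ievt][idiag] layout used at launch time:

    // offset( ighel, ievt, idiag ) = ( ievt + ighel * nevt ) * ndiagrams + idiag
    fptype* tot = ghelAllNumerators + ievt * processConfig::ndiagrams; // ighel == 0 slice
    for( int ighel = 1; ighel < dcNGoodHel; ighel++ )
      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
        tot[idiag] += ghelAllNumerators[( ievt + ighel * nevt ) * processConfig::ndiagrams + idiag];
    if( mulChannelWeight ) // multiply the ME by the single-diagram enhancement weight
      allMEs[ievt] *= tot[allChannelIds[ievt] - 1] / ghelAllDenominators[ievt];
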
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1217,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1265,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1365,13 +1468,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1379,13 +1476,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 26652fc739..5fdf36bb26 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/processConfig.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 
CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index f6e0894592..e32a6ef9b4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
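Note: processConfig.h is a small generated per-process header exposing compile-time constants, here the diagram count of the P1 subprocess (3 for gg -> ttx), so that host and device code can size the per-diagram buffers without including CPPProcess internals. A minimal consumer sketch; the static_assert is illustrative and not part of the patch:

    #include "processConfig.h"

    // numerators are now ndiagrams entries per event instead of 1
    constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
    static_assert( processConfig::ndiagrams > 0, "expected a positive diagram count" );
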
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; @@ -410,8 +474,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -424,8 +491,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -439,8 +509,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -452,8 +525,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -466,8 +542,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -480,8 +559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -493,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -507,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -520,8 +608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -534,8 +625,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -548,8 +642,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -564,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -577,8 +677,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -590,8 +693,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -949,9 +1055,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, 
gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1027,8 +1132,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1090,25 +1194,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1153,16 +1267,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color 
selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1230,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1239,6 +1382,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] 
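Note: stripped of the buffer indexing, the diagram choice in select_col_and_diag is a plain inverse-CDF draw over the per-diagram numerators, skipping diagrams without an SDE configuration (channel2iconfig == -1). An illustrative scalar version, where num[i] stands for the numerator of diagram i for this event and rnd for allrnddiagram[ievt] in [0,1):

    fptype norm = 0;
    for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
      if( mgOnGpu::channel2iconfig[ichan] != -1 ) norm += num[ichan];
    unsigned int channelId = mgOnGpu::nchannels; // fallback if rounding exhausts the loop
    fptype cdf = 0;
    for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
    {
      if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
      cdf += num[ichan];
      if( rnd < cdf / norm ) { channelId = ichan + 1; break; }
    }
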
super-buffer for nGoodHel <= ncomb individual helicities @@ -1250,8 +1395,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt * ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1279,7 +1426,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1295,7 +1441,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1309,11 +1455,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1325,6 +1474,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1333,9 +1483,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, 
ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1346,9 +1497,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1386,40 +1543,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1434,7 +1557,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1482,82 +1606,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1582,13 +1721,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1596,13 +1729,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index a0fbcbb773..a49500a023 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: 
random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/processConfig.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/processConfig.h new file mode 100644 index 0000000000..47044dbe6a --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
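+// +// This header collects per-process compile-time constants: processConfig::ndiagrams +// sizes the per-diagram numerator buffers (see sizePerEventNumerators in MemoryBuffers.h).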
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 16; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* 
momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
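+ // map each supplied input key to its typed pointer; keys not handled by this implementation are rejected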
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to twice the SIMD page size (mixed-precision SIMD builds process two neppV pages at a time, see #924) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
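+ * New keys are only appended, so existing enum values keep their meaning within + * a major version.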
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Retrieves a metadata value of the compiled matrix element (device type, number + * of particles, diagrams or helicity combinations). + * + * @param meta_key + * key identifying the requested metadata entry + * @param result + * pointer to storage for the result; its type depends on the key + * (UmamiDevice for UMAMI_META_DEVICE, int for the count keys) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
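+ * + * A minimal illustrative call (hypothetical values: 32 events of a 5-particle + * process, batch stride equal to the event count, zero offset; `handle` comes + * from umami_initialize; momenta are laid out as [component][particle][event] + * with the given stride, which is the layout this implementation transposes from): + * + * double momenta[4 * 5 * 32]; // filled by the caller + * double me2[32]; + * UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA }; + * const void* ins[] = { momenta }; + * UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT }; + * void* outs[] = { me2 }; + * UmamiStatus st = umami_matrix_element( handle, 32, 32, 0, 1, in_keys, ins, 1, out_keys, outs );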
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index e50d05daa6..d7b2672731 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004053354263305664  +DEBUG: model prefixing takes 0.0017828941345214844  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.016 s +1 processes with 16 diagrams generated in 0.024 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -176,25 +177,25 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.030 s -Wrote files for 36 helas calls in 0.096 s +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.016 s +Wrote files for 36 helas calls in 2.297 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of 
routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.242 s +ALOHA: aloha creates 5 routines in 0.185 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.216 s +ALOHA: aloha creates 10 routines in 0.172 s VVV1 VVV1 FFV1 @@ -204,32 +205,34 @@ ALOHA: aloha creates 10 routines in 0.216 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. 
quit -real 0m2.399s -user 0m2.037s -sys 0m0.357s -Code generation completed in 3 seconds +real 0m9.939s +user 0m1.538s +sys 0m0.629s +Code generation completed in 10 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -250,9 +253,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -279,9 +282,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 7f29af7755..b8f69df605 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool 
sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0];
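// NB: illustrative sketch only, not part of the generated diff; the helper name is hypothetical.
// The hunks above widen the numerator buffer from one entry per event to one entry per diagram per
// event (sizePerEventNumerators becomes processConfig::ndiagrams), and each CUDA thread now works
// on its own slice &allNumerators[ievt * processConfig::ndiagrams]. The assumed CUDA-side indexing
// is simply a flattened [ievt][idiag] layout (the C++ SIMD path goes through NUM_ACCESS pages):

inline __host__ __device__ fptype& numeratorRef( fptype* allNumerators, int ievt, int idiag )
{
  // ndiagrams contiguous entries per event, i.e. allNumerators[ievt][idiag] flattened
  return allNumerators[ievt * processConfig::ndiagrams + idiag];
}

// With this layout, the per-diagram hunks below replace the old conditional running sum
// "if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] )" by an unconditional store into
// slot N-1, so the weights of all channels are available after a single pass over the diagrams.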
@@ -410,8 +474,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -424,8 +491,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -439,8 +509,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -452,8 +525,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -466,8 +542,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -480,8 +559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -493,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
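// NB: illustrative scalar sketch only, not part of the generated diff; the helper name is
// hypothetical. Once every diagram d has accumulated |A_d|^2 into its own slot, the
// single-diagram-enhancement weight of channel c is formed at the end (see normalise_output below)
// as w_c = |A_c|^2 / sum_d |A_d|^2 = numerators[c-1] / denominator, instead of being accumulated
// only for the one channelId known upfront:

inline fptype channelWeight( const fptype* numerators, fptype denominator, unsigned int channelId )
{
  // channelId is 1-based (Fortran convention), numerator slots are 0-based
  return numerators[channelId - 1] / denominator;
}

// allMEs[ievt] is then multiplied by this weight only when mulChannelWeight is true.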
@@ -507,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -520,8 +608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -534,8 +625,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -548,8 +642,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -564,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -577,8 +677,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -590,8 +693,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -949,9 +1055,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; 
// disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1027,8 +1132,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1090,25 +1194,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; + } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1153,16 +1267,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1230,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1239,6 +1382,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1250,8 +1395,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1279,7 +1426,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1295,7 +1441,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1309,11 +1455,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1325,6 +1474,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1333,9 +1483,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = 
ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1346,9 +1497,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1386,40 +1543,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1434,7 +1557,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1482,82 +1606,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1582,13 +1721,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1596,13 +1729,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ )
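// NB: illustrative sketch only, not part of the generated diff; the function name is hypothetical.
// Both select_col_and_diag (CUDA) and the C++ loop above sample a diagram ("channel") from the
// per-diagram numerators by inverting the discrete CDF with one uniform random number. Stripped of
// the SIMD paging and of the channel2iconfig bookkeeping, the algorithm is:

unsigned int sampleDiagram( const fptype* numerators, int ndiagrams, fptype rnd )
{
  fptype normalization = 0;
  for( int d = 0; d < ndiagrams; ++d ) normalization += numerators[d]; // sum of |A_d|^2
  fptype cumulative = 0;
  for( int d = 0; d < ndiagrams; ++d )
  {
    cumulative += numerators[d];
    if( rnd < cumulative / normalization ) return d + 1; // 1-based channel id
  }
  return ndiagrams; // guard for rnd ~ 1.0 (the patch presets channelId = mgOnGpu::nchannels for the same purpose)
}

// The real code additionally skips diagrams with mgOnGpu::channel2iconfig[d] == -1 (no associated
// SDE configuration) in both the normalization and the cumulative sum.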
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 1b49cac30b..1b956214b7 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/processConfig.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/processConfig.h new file mode 100644 index 0000000000..47044dbe6a --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 16; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* 
couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + }
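// NB: illustrative usage sketch only, not part of the generated diff. It assumes the count metas
// are written through the void* result slot as int, matching the casts reconstructed above (the
// stripped cast targets are an editorial reconstruction):

void queryProcessMeta()
{
  UmamiDevice device;
  int ndiagrams = 0;
  if( umami_get_meta( UMAMI_META_DEVICE, &device ) != UMAMI_SUCCESS ) return; // CPU, CUDA or HIP build
  if( umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiagrams ) != UMAMI_SUCCESS ) return;
  // UMAMI_META_COLOR_COUNT is not implemented yet and returns UMAMI_ERROR_UNSUPPORTED_META
}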
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // need to round up to twice the SIMD page size for some reason
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ *  Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries a global metadata entry of the matrix element code.
+   *
+   * @param meta_key
+   *     key of the metadata entry to query
+   * @param result
+   *     pointer to caller-allocated memory where the result is written. Its type
+   *     depends on the key (UmamiDevice for UMAMI_META_DEVICE, int for the counts).
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *     pointer to an instance of the subprocess. Has to be cleaned up by
+   *     the caller with `umami_free`.
+   * @param param_card_path
+   *     path to the parameter file
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     real part of the parameter value
+   * @param parameter_imag
+   *     imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *     pointer to double to return imaginary part of the parameter value. Ignored
+   *     for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param count
+   *     number of events to evaluate the matrix element for
+   * @param stride
+   *     stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *     offset of the event index
+   * @param input_count
+   *     number of inputs to the matrix element
+   * @param input_keys
+   *     pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *     pointer to an array of void pointers to the inputs. The type of the inputs
+   *     depends on the input key
+   * @param output_count
+   *     number of outputs to the matrix element
+   * @param output_keys
+   *     pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *     pointer to an array of void pointers to the outputs. The type of the outputs
+   *     depends on the output key. The caller is responsible for allocating memory for
+   *     the outputs.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
index ab60b4e5bd..c635672d98 100644
--- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -57,7 +58,7 @@ generate g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.0042188167572021484
+DEBUG: model prefixing takes 0.0044193267822265625
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step.
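For reference, a minimal sketch of a caller driving the UMAMI interface declared in umami.h above. This is illustrative only and not part of the patch: the param_card path, the event counts, the zero-filled momenta buffer, and the int-typed result for the count metadata keys are assumptions; a real caller must fill the momenta with physical phase-space points, laid out with the batch stride as in the other UMAMI arrays.

// umami_example.cc - illustrative UMAMI caller (not part of this patch)
#include "umami.h"

#include <cstdio>
#include <vector>

int main()
{
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1;

  int npar = 0;
  umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ); // assumed int result; 5 for g g > t t~ g

  const size_t count = 8, stride = 8, offset = 0;
  std::vector<double> momenta( stride * npar * 4, 0. ); // placeholder: fill with real phase-space points
  std::vector<double> m2( stride, 0. );

  const UmamiInputKey input_keys[1] = { UMAMI_IN_MOMENTA };
  const void* inputs[1] = { momenta.data() };
  const UmamiOutputKey output_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outputs[1] = { m2.data() };
  if( umami_matrix_element( handle, count, stride, offset, 1, input_keys, inputs, 1, output_keys, outputs ) == UMAMI_SUCCESS )
    std::printf( "ME[0] = %g\n", m2[0] );

  umami_free( handle );
  return 0;
}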
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.017 s +1 processes with 16 diagrams generated in 0.025 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
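An aside on the SIMD-paged buffers read by the C++ output loop in umami.cc above: per-diagram numerators are stored page-major, with the indexing numerators[i_page * page_size * ndiagrams + i_diag * page_size + i_vector]. The following standalone sketch spells out that index arithmetic; the neppV and ndiagrams values here are illustrative, not taken from the generated code.

// numerator_layout_sketch.cc - illustrative index arithmetic for the SIMD-paged layout
#include <cassert>
#include <cstddef>

constexpr std::size_t neppV = 4;      // events per SIMD page (assumed value)
constexpr std::size_t ndiagrams = 16; // number of diagrams (16 for g g > t t~ g)

// Same arithmetic as the copy-out loop's numerators[...] indexing above
std::size_t numeratorIndex( std::size_t ievt, std::size_t idiag )
{
  const std::size_t ipage = ievt / neppV;
  const std::size_t ivec = ievt % neppV;
  return ipage * neppV * ndiagrams + idiag * neppV + ivec;
}

int main()
{
  // Within one page, the same diagram for consecutive events is contiguous...
  assert( numeratorIndex( 1, 3 ) == numeratorIndex( 0, 3 ) + 1 );
  // ...while consecutive diagrams of one event are neppV elements apart.
  assert( numeratorIndex( 0, 4 ) == numeratorIndex( 0, 3 ) + neppV );
  return 0;
}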
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.029 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. +Generated helas calls for 1 subprocesses (16 diagrams) in 0.015 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.230 s +ALOHA: aloha creates 5 routines in 0.161 s VVV1 VVV1 FFV1 @@ -186,17 +187,17 @@ ALOHA: aloha creates 5 routines in 0.230 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
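The per-event diagram choice added by this patch (select_col_and_diag on the GPU side, and the equivalent C++ loop in sigmaKin, both further below) is an inverse-CDF draw over the per-diagram numerators, skipping channels whose channel2iconfig entry is -1. A standalone sketch of that selection logic, with illustrative channel counts and weights:

// diagram_sampling_sketch.cc - illustrative inverse-CDF channel choice (not the generated code)
#include <cstdio>

constexpr unsigned int nchannels = 3;
constexpr int channel2iconfig[nchannels] = { 1, -1, 2 }; // -1: channel has no SDE config and is skipped

unsigned int sampleDiagram( const double* numerators, double rnd )
{
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  double numerator_sum = 0.;
  unsigned int channelId = nchannels; // fallback, as in the kernel
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    numerator_sum += numerators[ichan];
    if( rnd < numerator_sum / normalization )
    {
      channelId = ichan + 1; // NB channelIds are 1-based
      break;
    }
  }
  return channelId;
}

int main()
{
  const double numerators[nchannels] = { 3., 0., 1. }; // relative weights 3 : (skipped) : 1
  std::printf( "rnd=0.50 -> channelId %u\n", sampleDiagram( numerators, 0.50 ) ); // prints 1
  std::printf( "rnd=0.90 -> channelId %u\n", sampleDiagram( numerators, 0.90 ) ); // prints 3
  return 0;
}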
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.642s -user 0m0.586s -sys 0m0.050s -Code generation completed in 1 seconds +real 0m2.175s +user 0m0.523s +sys 0m0.144s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
 assert( useChannelIds == false );
 sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
 }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
 // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
 // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
 static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
 
 #include
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
 typedef BufferBase BufferNumerators;
 
 // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc
index 3897ffd9b4..45ea024451 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+
const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -943,9 +1004,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, 
allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1021,8 +1081,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1084,25 +1143,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1147,16 +1216,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random 
numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1224,6 +1321,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1233,6 +1331,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1244,8 
+1344,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1273,7 +1375,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1289,7 +1390,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1303,11 +1404,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1319,6 +1423,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1327,9 +1432,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, 
allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1340,9 +1446,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1380,40 +1492,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1428,7 +1506,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1476,82 +1555,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1576,13 +1670,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1590,13 +1678,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 1b49cac30b..1b956214b7 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* 
allrndcol,           // input: random numbers[nevt] for color selection
   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+  const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
   fptype* allMEs,  // output: allMEs[nevt], |M|^2 final_avg_over_helicities
   int* allselhel,  // output: helicity selection[nevt]
@@ -171,6 +172,8 @@
   fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
   fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
   fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+  unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+  bool mulChannelWeight,          // if true, multiply the channel weight into the ME output
 #endif
   fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
   fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const fptype* allrndcol,           // input: random numbers[nevt] for color selection
   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+  const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
   fptype* allMEs,  // output: allMEs[nevt], |M|^2 final_avg_over_helicities
   int* allselhel,  // output: helicity selection[nevt]
@@ -194,6 +198,8 @@
   int* allselcol,          // output: color selection[nevt]
   fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
   fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+  unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+  bool mulChannelWeight,          // if true, multiply the channel weight into the ME output
 #endif
   const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/processConfig.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/processConfig.h
new file mode 100644
index 0000000000..47044dbe6a
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
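+//
+// Illustrative sketch (not generated code; buffer and index names are hypothetical):
+// ndiagrams sizes the per-diagram numerator buffers used for event-by-event channel
+// selection, with the indexing conventions assumed elsewhere in this patch:
+//   fptype* numerators = new fptype[nevt * processConfig::ndiagrams];
+//   numerators[i_event * processConfig::ndiagrams + i_diag];                            // GPU layout
+//   numerators[i_page * neppV * processConfig::ndiagrams + i_diag * neppV + i_vector];  // C++ SIMD paged layout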
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H
+#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 16;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cstddef>
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool is_good_hel[CPPProcess::ncomb];
+    sigmaKin_getGoodHel(
+      momenta, couplings, matrix_elements, numerators, denominators,
+#ifdef MGONGPUCPP_GPUIMPL
+      color_jamps,
+#endif
+      is_good_hel,
+      count );
+    sigmaKin_setGoodHel( is_good_hel );
+    return nullptr;
+  }
+
+  void initialize(
const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
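+        // Dispatch each caller-provided input pointer by its key; keys that this
+        // backend does not support are rejected rather than silently ignored.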
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to twice the SIMD page size (in mixed precision the
+    // C++ sigmaKin processes two neppV event pages per call, cf. #924)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves global metadata about the matrix element implementation.
+   *
+   * @param meta_key
+   *     key selecting which metadata entry to query
+   * @param result
+   *     pointer to caller-allocated storage receiving the value; its type depends on
+   *     the key (UmamiDevice for UMAMI_META_DEVICE, int for the *_COUNT keys)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *     pointer to an instance of the subprocess. Has to be cleaned up by
+   *     the caller with `umami_free`.
+   * @param param_card_path
+   *     path to the parameter file
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     real part of the parameter value
+   * @param parameter_imag
+   *     imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *     pointer to double to return imaginary part of the parameter value. Ignored
+   *     for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
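+   *
+   * Usage sketch (illustrative only: a 5-particle process evaluated for 3 events,
+   * requesting just |M|^2, with error handling omitted):
+   * @code
+   *   UmamiHandle handle;
+   *   umami_initialize( &handle, "param_card.dat" );
+   *   double momenta[4 * 5 * 3]; // layout [i_mom][i_part][i_event], stride == count == 3
+   *   // ... fill momenta ...
+   *   double m2[3];
+   *   UmamiInputKey in_keys[1] = { UMAMI_IN_MOMENTA };
+   *   void const* inputs[1] = { momenta };
+   *   UmamiOutputKey out_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* outputs[1] = { m2 };
+   *   umami_matrix_element( handle, 3, 3, 0, 1, in_keys, inputs, 1, out_keys, outputs );
+   *   umami_free( handle );
+   * @endcode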
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 8c941153c6..b697a4a0e0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004433155059814453  +DEBUG: model prefixing takes 0.0018012523651123047  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.125 s +1 processes with 123 diagrams generated in 0.085 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -176,25 +177,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 
33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.307 s -Wrote files for 222 helas calls in 0.475 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.154 s +Wrote files for 222 helas calls in 3.038 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.280 s +ALOHA: aloha creates 5 routines in 0.194 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.246 s +ALOHA: aloha creates 10 routines in 0.167 s VVV1 VVV1 FFV1 @@ -207,32 +208,34 @@ ALOHA: aloha creates 10 routines in 0.246 s VVVV3 VVVV4 VVVV4 -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m3.426s -user 0m3.041s -sys 0m0.376s -Code generation completed in 4 seconds +real 0m10.227s +user 0m2.050s +sys 0m0.643s +Code generation completed in 10 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -253,9 +256,9 @@ Code generation completed in 4 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -282,9 +285,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
 
 #include
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;
 
   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc
index 6664e7c6fc..ca0360d110 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int
ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -239,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -248,7 +312,7 @@ namespace mg5amcCpu
 #else
     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -344,7 +408,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -357,7 +422,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -366,12 +431,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -430,8 +491,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 2
       VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -450,8 +514,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 3
      VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -470,8 +537,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 4
       VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -491,8 +561,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[4] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -505,8 +578,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[5] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -521,8 +597,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[6] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -535,8 +614,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 8
       FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[7] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -549,8 +631,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[8] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
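The same mechanical change repeats for every diagram hunk below: the old code accumulated one scalar numerator, gated on the preselected `channelId`, while the new code stores every diagram's |amp|^2 so that the weight of any channel can be formed after the fact. A standalone sketch (hypothetical sizes and names, not the generated code):

```cpp
// Sketch only: per-diagram numerators replace the single channel-gated scalar.
#include <cstdio>

constexpr int ndiagrams = 4; // stand-in for processConfig::ndiagrams

int main()
{
  double num[ndiagrams] = {}; // per-diagram numerators for one event
  double den = 0.;            // common denominator (sum over all diagrams)
  const double amp2[ndiagrams] = { 0.1, 0.4, 0.3, 0.2 }; // mock |amp|^2 per diagram
  for( int idiag = 0; idiag < ndiagrams; ++idiag ) // what each diagram hunk does
  {
    num[idiag] += amp2[idiag]; // was: "if( channelId == idiag+1 ) numerator += ..."
    den += amp2[idiag];        // was: "if( channelId != 0 ) denominator += ..."
  }
  const unsigned int channelId = 2; // any channel can now be weighted after the fact
  std::printf( "w(channel %u) = %f\n", channelId, num[channelId - 1] / den );
  return 0;
}
```

Note the off-by-one convention carried through the whole diff: `channelId` is 1-based, so diagram N updates `numerators_sv[N-1]`.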
@@ -565,8 +650,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 10
       FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[9] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -579,8 +667,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[10] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -593,8 +684,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[11] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -609,8 +703,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 13
       FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[12] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -623,8 +720,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 14
       FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[13] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -639,8 +739,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 15
       FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[14] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[1] -= amp_sv[0];
@@ -655,8 +758,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 16
       FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[15] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += amp_sv[0];
       jamp_sv[17] -= amp_sv[0];
@@ -673,8 +779,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 17
       FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[16] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= amp_sv[0];
@@ -686,8 +795,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 18
       FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[17] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= amp_sv[0];
@@ -699,8 +811,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 19
       FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[18] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -714,8 +829,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 20
       VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[19] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
@@ -730,8 +848,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 21
       FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[20] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -744,8 +865,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 22
       FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[21] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -758,8 +882,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 23
       VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[22] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
@@ -774,8 +901,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 24
       FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[23] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -788,8 +918,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 25
       FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[24] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -802,8 +935,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 26
       FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[25] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= amp_sv[0];
@@ -815,8 +951,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 27
       FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[26] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= amp_sv[0];
@@ -828,8 +967,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 28
       FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[27] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= amp_sv[0];
@@ -841,8 +983,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 29
       FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[28] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= amp_sv[0];
@@ -854,8 +999,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 30
       FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[29] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -868,8 +1016,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 31
       VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[30] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[1] -= amp_sv[0];
@@ -910,8 +1061,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 33
       FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[32] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] -= amp_sv[0];
@@ -923,8 +1077,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 34
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[33] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] -= amp_sv[0];
@@ -936,8 +1093,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 35
       FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[34] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -950,8 +1110,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 36
       VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[35] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -966,8 +1129,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 37
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[36] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -980,8 +1146,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 38
       FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[37] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -994,8 +1163,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 39
       VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[38] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -1010,8 +1182,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 40
       FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[39] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1024,8 +1199,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 41
       FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[40] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1038,8 +1216,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 42
       FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[41] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] -= amp_sv[0];
@@ -1051,8 +1232,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 43
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[42] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[15] -= amp_sv[0];
@@ -1064,8 +1248,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 44
       FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[43] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] -= amp_sv[0];
@@ -1077,8 +1264,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 45
       FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[44] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] -= amp_sv[0];
@@ -1090,8 +1280,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 46
       FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[45] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1104,8 +1297,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 47
       VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[46] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[11] -= amp_sv[0];
@@ -1143,8 +1339,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 49
       FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[48] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1157,8 +1356,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 50
       FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[49] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += amp_sv[0];
       jamp_sv[8] -= amp_sv[0];
@@ -1173,8 +1375,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 51
       FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[50] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1187,8 +1392,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 52
       FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[51] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1201,8 +1409,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 53
       FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[52] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -1217,8 +1428,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 54
       FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[53] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1231,8 +1445,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 55
       FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[54] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
@@ -1247,8 +1464,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 56
       FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[55] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += amp_sv[0];
       jamp_sv[11] -= amp_sv[0];
@@ -1263,8 +1483,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 57
       VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[56] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1317,8 +1540,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 59
       VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[58] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1337,8 +1563,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 60
       VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[59] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1357,8 +1586,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 61
       FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[60] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -1373,8 +1605,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 62
       FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[61] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1387,8 +1622,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 63
       FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[62] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -1403,8 +1641,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 64
       FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[63] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1418,8 +1659,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 65
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[64] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1432,8 +1676,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 66
       FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[65] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += amp_sv[0];
       jamp_sv[8] -= amp_sv[0];
@@ -1448,8 +1695,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 67
       FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[66] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1462,8 +1712,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 68
       FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[67] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1476,8 +1729,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 69
       FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[68] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -1492,8 +1748,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 70
       FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[69] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1506,8 +1765,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 71
       FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[70] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
@@ -1522,8 +1784,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 72
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[71] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += amp_sv[0];
       jamp_sv[9] -= amp_sv[0];
@@ -1538,8 +1803,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 73
       VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[72] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1592,8 +1860,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 75
       VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[74] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1612,8 +1883,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 76
       VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[75] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1632,8 +1906,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 77
       FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[76] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -1648,8 +1925,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 78
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[77] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1662,8 +1942,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 79
       FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[78] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -1678,8 +1961,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 80
       FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[79] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1692,8 +1978,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 81
       FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[80] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= amp_sv[0];
@@ -1705,8 +1994,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 82
       FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[81] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= amp_sv[0];
@@ -1718,8 +2010,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 83
       FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[82] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] -= amp_sv[0];
@@ -1731,8 +2026,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 84
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[83] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= amp_sv[0];
@@ -1744,8 +2042,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 85
       FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[84] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1758,8 +2059,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 86
       FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[85] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -1774,8 +2078,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 87
       FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[86] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= amp_sv[0];
@@ -1787,8 +2094,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 88
       FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[87] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= amp_sv[0];
@@ -1800,8 +2110,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 89
       FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[88] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] -= amp_sv[0];
@@ -1813,8 +2126,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 90
       FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[89] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= amp_sv[0];
@@ -1826,8 +2142,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 91
       FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[90] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1840,8 +2159,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 92
       FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[91] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
@@ -1890,8 +2212,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 94
       VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[93] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1910,8 +2235,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 95
       VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[94] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1930,8 +2258,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 96
       FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[95] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[19] -= amp_sv[0];
@@ -1946,8 +2277,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 97
       FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[96] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1960,8 +2294,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 98
       FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[97] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
@@ -1976,8 +2313,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 99
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[98] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2024,8 +2364,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 101
       VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[100] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2044,8 +2387,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 102
       VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[101] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2064,8 +2410,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 103
       FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[102] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -2080,8 +2429,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 104
       FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[103] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2094,8 +2446,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 105
       FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[104] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[4] -= amp_sv[0];
@@ -2110,8 +2465,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 106
       FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[105] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2158,8 +2516,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 108
       VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[107] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2178,8 +2539,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 109
       VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[108] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2198,8 +2562,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 110
       FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[109] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= amp_sv[0];
@@ -2211,8 +2578,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 111
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[110] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= amp_sv[0];
@@ -2224,8 +2594,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 112
       FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[111] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= amp_sv[0];
@@ -2237,8 +2610,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 113
       FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[112] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] -= amp_sv[0];
@@ -2877,9 +3253,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -2955,8 +3330,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
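The finalize hunk below walks the enlarged numerator super-buffer: nGoodHel helicity slices, each holding nevt events times ndiagrams per-diagram numerators, i.e. offset `( ievt + ighel * nevt ) * ndiagrams`. A sketch of that indexing with hypothetical names and sizes:

```cpp
// Sketch only: locate the per-diagram numerator slice for (helicity, event).
#include <cassert>

constexpr int ndiagrams = 4; // stand-in for processConfig::ndiagrams

inline double* helNumerators( double* ghelAllNumerators, int ighel, int ievt, int nevt )
{
  return ghelAllNumerators + ( ievt + ighel * nevt ) * ndiagrams;
}

int main()
{
  constexpr int nGoodHel = 2, nevt = 8;
  double buf[nGoodHel * nevt * ndiagrams] = {};
  buf[( 3 + 1 * nevt ) * ndiagrams] = 1.; // helicity #1, event #3, diagram #0
  assert( helNumerators( buf, 1, 3, nevt )[0] == 1. );
  return 0;
}
```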
@@ -3018,25 +3392,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights, // if true, compute final multichannel weights
+    bool mulChannelWeight, // if true, multiply matrix element by channel weight
 #endif
-    const fptype globaldenom ) /* clang-format on */
+    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -3081,16 +3465,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol, // output: color selection[nevt]
-              const fptype* allrndcol, // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+                       const fptype* allrndcol, // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators, // input: all numerators
+                       const fptype* allDenominators, // input: all denominators
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -3158,6 +3570,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -3167,6 +3580,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -3178,8 +3593,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -3207,7 +3624,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
non-trivial access: buffer includes all events #endif #endif @@ -3223,7 +3639,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -3237,11 +3653,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -3253,6 +3672,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -3261,9 +3681,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -3274,9 +3695,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3314,40 +3741,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -3362,7 +3755,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -3410,82 +3804,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -3510,13 +3919,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -3524,13 +3927,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 7b57d7c763..e3c578f5e0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/processConfig.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/processConfig.h new file mode 100644 index 0000000000..4f4a3c3bc0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 
2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGG_H
+#define MG5_CONFIG_SIGMA_SM_GG_TTXGG_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 123;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGG_H
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cstddef> // std::size_t
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool is_good_hel[CPPProcess::ncomb];
+    sigmaKin_getGoodHel(
+      momenta,
+      couplings,
+      matrix_elements,
+      numerators,
+      denominators,
+#ifdef MGONGPUCPP_GPUIMPL
+      color_jamps,
+#endif
+      is_good_hel,
+      count );
+    sigmaKin_setGoodHel( is_good_hel );
+    return nullptr;
+  }
+
+  void initialize(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    // static local initialization is called exactly once in a thread-safe way
+    static void* dummy = initialize_impl( momenta, couplings, matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+                                          color_jamps,
+#endif
+                                          numerators,
+                                          denominators,
+                                          count );
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__
+#endif
+    void
+    transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride )
+  {
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    std::size_t i_page = i_event / page_size;
+    std::size_t i_vector = i_event % page_size;
+
+    for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part )
+    {
+      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
+      {
+        momenta_out[i_page * CPPProcess::npar * 4 * page_size +
+                    i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event];
+      }
+    }
+  }
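For reference, the transposition above maps a caller-side layout, momenta_in[stride * ( npar * i_mom + i_part ) + i_event], onto the cudacpp AOSOA layout with page size neppM. A minimal standalone sketch of the same index arithmetic, using hypothetical toy sizes (npar = 2, page_size = 4) instead of the real CPPProcess::npar and MemoryAccessMomentaBase::neppM:

#include <cstddef>
#include <vector>

// Toy AOSOA transposition: the same arithmetic as transpose_momenta above,
// with made-up sizes so it can compile and run standalone.
int main()
{
  const std::size_t npar = 2, page_size = 4, nevt = 8, stride = nevt;
  std::vector<double> in( stride * npar * 4 ), out( nevt * npar * 4 );
  for( std::size_t i = 0; i < in.size(); ++i ) in[i] = double( i );
  for( std::size_t i_event = 0; i_event < nevt; ++i_event )
  {
    const std::size_t i_page = i_event / page_size;   // which AOSOA page
    const std::size_t i_vector = i_event % page_size; // slot within the page
    for( std::size_t i_part = 0; i_part < npar; ++i_part )
      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
        out[i_page * npar * 4 * page_size + i_part * 4 * page_size + i_mom * page_size + i_vector] =
          in[stride * ( npar * i_mom + i_part ) + i_event];
  }
  return 0;
}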
+
+#ifdef MGONGPUCPP_GPUIMPL
+
+  __global__ void copy_inputs(
+    const double* momenta_in,
+    const double* helicity_random_in,
+    const double* color_random_in,
+    const double* diagram_random_in,
+    const double* alpha_s_in,
+    fptype* momenta,
+    fptype* helicity_random,
+    fptype* color_random,
+    fptype* diagram_random,
+    fptype* g_s,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
+    helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
+    color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
+    g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s = sqrt( 4 * pi * 0.118 ), i.e. alpha_s = 0.118
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
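The initialize() helper above relies on the guaranteed one-time, thread-safe initialization of a function-local static. A minimal sketch of the same exactly-once idea expressed with std::call_once instead (hypothetical helper, not part of this file):

#include <mutex>
#include <utility>

// Hypothetical alternative to the static-local trick in initialize():
// std::call_once also guarantees exactly-once, thread-safe execution.
template<typename F>
void run_once( F&& f )
{
  static std::once_flag flag;
  std::call_once( flag, std::forward<F>( f ) );
}

// usage: run_once( [&] { /* one-time helicity-filtering setup */ } );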
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
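As a usage illustration (hypothetical host code, not part of this PR), the metadata entry point above can be exercised as follows; this assumes the integer-sized results used by the implementation for the count keys:

#include "umami.h"
#include <cstdio>

// Hypothetical caller: query the process shape before allocating buffers.
int query_process_shape()
{
  UmamiDevice device;
  int npar = 0, ndiag = 0, nhel = 0;
  if( umami_get_meta( UMAMI_META_DEVICE, &device ) != UMAMI_SUCCESS ) return -1;
  if( umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ) != UMAMI_SUCCESS ) return -1;
  if( umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiag ) != UMAMI_SUCCESS ) return -1;
  if( umami_get_meta( UMAMI_META_HELICITY_COUNT, &nhel ) != UMAMI_SUCCESS ) return -1;
  std::printf( "device=%d npar=%d ndiagrams=%d ncomb=%d\n", (int)device, npar, ndiag, nhel );
  return 0;
}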
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // need to round up to twice the SIMD page size, presumably because in "mixed"
+    // precision mode the SIMD kernels process two neppV pages per call (see #924)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s = sqrt( 4 * pi * 0.118 )
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
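Putting the pieces together, a hypothetical end-to-end caller of this interface might look as follows (a sketch only: the param_card path, event count and stride choice are illustrative, and momenta must be filled with real phase-space points; integer-sized metadata results are assumed as above):

#include "umami.h"
#include <cstdio>
#include <vector>

// Hypothetical driver: evaluate |M|^2 for a small batch of events.
// Momenta are laid out as momenta[stride * ( npar * i_mom + i_part ) + i_event],
// matching the transposition performed inside umami_matrix_element.
int main()
{
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1;

  int npar = 0;
  umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar );

  const size_t count = 16, stride = count, offset = 0;
  std::vector<double> momenta( stride * npar * 4 ), alpha_s( count, 0.118 );
  std::vector<double> m2( count );
  // ... fill momenta with phase-space points here ...

  UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
  const void* ins[] = { momenta.data(), alpha_s.data() };
  UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outs[] = { m2.data() };

  UmamiStatus status = umami_matrix_element( handle, count, stride, offset,
                                             2, in_keys, ins, 1, out_keys, outs );
  if( status == UMAMI_SUCCESS )
    for( size_t i = 0; i < count; ++i ) std::printf( "m2[%zu] = %g\n", i, m2[i] );

  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}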
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *   _   _ _ __ ___   __ _ _ __ ___  _
+ *  | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ *  | |_| | | | | | | (_| | | | | | | |
+ *   \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ *  Unified MAtrix eleMent Interface
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h> // size_t
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries a global metadata entry of the compiled process (device type, particle
+   * count, diagram count, helicity combinations, ...).
+   *
+   * @param meta_key
+   *    key identifying the metadata entry to retrieve
+   * @param result
+   *    pointer to caller-allocated storage for the result; its type depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @param param_card_path
+   *    path to the parameter file
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
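The "see memory layout" references in the docstring below are not expanded anywhere in this header; as a hedged summary, derived from the indexing that umami.cc actually performs, the convention is:

// Batch memory layout as consumed by this implementation (see umami.cc):
//   momenta:  momenta[offset + stride * ( npar * i_mom + i_part ) + i_event]
//   amp2:     amp2[offset + stride * i_diag + i_event]
//   scalars:  x[offset + i_event]   (alpha_s, random numbers, per-event outputs)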
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs of the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
index 691a9d08c7..2cc4b19c4e 100644
--- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@
 Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -57,7 +58,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.004384040832519531
+DEBUG: model prefixing takes 0.001790761947631836
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.118 s +1 processes with 123 diagrams generated in 0.083 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.366 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +Generated helas calls for 1 subprocesses (123 diagrams) in 0.152 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.231 s +ALOHA: aloha creates 5 routines in 0.173 s VVV1 VVV1 FFV1 @@ -189,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.231 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.208s -user 0m1.150s -sys 0m0.049s -Code generation completed in 2 seconds +real 0m2.439s +user 0m0.757s +sys 0m0.131s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index d43252d697..d9dc04eb8b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef
MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -2934,9 +2995,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, 
allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -3012,8 +3072,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -3075,25 +3134,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -3138,16 +3207,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random 
numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -3215,6 +3312,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -3224,6 +3322,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -3235,8 
+3335,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -3264,7 +3366,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -3280,7 +3381,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -3294,11 +3395,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -3310,6 +3414,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -3318,9 +3423,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, 
allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -3331,9 +3437,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3371,40 +3483,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -3419,7 +3497,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -3467,82 +3546,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -3567,13 +3661,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -3581,13 +3669,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 7b57d7c763..e3c578f5e0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const 
fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/processConfig.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/processConfig.h new file mode 100644 index 0000000000..4f4a3c3bc0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 123; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } 
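[Note on the pattern used here: the `initialize` wrapper that follows relies on C++11 "magic statics" — the initializer of a function-local static variable is guaranteed to run exactly once, even when several threads call the function concurrently, which is why `initialize_impl` above returns a dummy `void*` that can seed such a static. A minimal standalone sketch of the same idiom; the names `ensureSetup` and `expensiveSetupImpl` are illustrative, not from the patch:]

#include <cstdio>

static int expensiveSetupImpl()
{
  // stands in for the one-time work (helicity filtering in the real code)
  std::printf( "setup runs exactly once\n" );
  return 0;
}

static void ensureSetup()
{
  // C++11 guarantees thread-safe, once-only initialization of this static:
  // the first caller runs expensiveSetupImpl(), all later callers skip it
  static int dummy = expensiveSetupImpl();
  (void)dummy; // silence unused-variable warnings
}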
+ + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries global metadata of the matrix element code, e.g. the device it was built
+   * for or the number of particles, diagrams, helicity and color configurations.
+   *
+   * @param meta_key
+   *    key of the metadata entry to query
+   * @param result
+   *    pointer to the memory that receives the value; its type depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
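+   *
+   * A minimal usage sketch (illustrative only: the buffer sizes, the number of
+   * external particles and the parameter-card path are assumptions of this example,
+   * not prescribed by the interface):
+   * @code
+   *   UmamiHandle handle;
+   *   umami_initialize( &handle, "param_card.dat" );
+   *   size_t n = 16;                  // events in the batch
+   *   double momenta[16 * 6 * 4];     // e.g. 6 external particles, one 4-momentum each
+   *   double me[16];                  // one matrix element value per event
+   *   UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA };
+   *   void const* in_ptrs[] = { momenta };
+   *   UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* out_ptrs[] = { me };
+   *   umami_matrix_element( handle, n, n, 0, 1, in_keys, in_ptrs, 1, out_keys, out_ptrs );
+   *   umami_free( handle );
+   * @endcode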
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
index 5908592d13..da1647639c 100644
--- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version. This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -57,7 +58,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.0061588287353515625 
+DEBUG: model prefixing takes 0.0017483234405517578 
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -150,27 +151,27 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.427 s +1 processes with 1240 diagrams generated in 0.713 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 3s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h @@ -178,25 +179,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 
405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 
705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 
119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 
551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 
830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.574 s -Wrote files for 2281 helas calls in 17.935 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 
225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 
674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 
928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 
406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 
705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 2.518 s +Wrote files for 2281 helas calls in 34.890 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.379 s +ALOHA: aloha creates 5 routines in 0.190 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.232 s +ALOHA: aloha creates 10 routines in 0.181 s VVV1 VVV1 FFV1 @@ -209,32 +210,34 @@ ALOHA: aloha creates 10 routines in 0.232 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m31.040s -user 0m30.219s -sys 0m0.591s -Code generation completed in 31 seconds +real 0m46.871s +user 0m13.972s +sys 0m1.182s +Code generation completed in 46 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -255,9 +258,9 @@ Code generation completed in 31 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -284,9 +287,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 85e7f8f09c..e9c80d8364 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL 
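/* kernelAccessP returns a pointer to the first SIMD word of the event record instead of a
   reference to a single fptype_sv, so the generated code can index one fptype_sv slot per
   diagram. A sketch of the access pattern it enables (NUM_ACCESS, ievt0 and idiag are taken
   from or assumed after the CPPProcess.cc hunks later in this patch):
     fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     numerators_sv[idiag] += cxabs2( amp_sv[0] ); // accumulate |amp|^2 for diagram 'idiag'
*/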
+ , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
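/* getChannelId collapses the per-event channelId buffer to one scalar: it returns 0 when
   multichannel is disabled (allChannelIds == nullptr), and otherwise asserts (#898, #924)
   that every event of the SIMD page (and, in "mixed" double/float precision, of both neppV
   pages) carries the same channel. A sketch of the C++-side call pattern (ievt00 as in the
   surrounding code; the branch body is illustrative):
     const unsigned int channelId = getChannelId( allChannelIds, ievt00 );
     if( channelId != 0 )
     {
       // single-diagram enhancement is active for this event page
     }
*/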
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; @@ -428,8 +492,11 @@ namespace mg5amcCpu // Amplitude(s) 
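/* From here on the generated diagram code follows one mechanical pattern: under
   storeChannelWeights, diagram N adds cxabs2( amp_sv[0] ) into its own slot
   numerators_sv[N-1] and into the shared denominators_sv, replacing the old per-diagram
   'if( channelId == N )' / 'if( channelId != 0 )' tests. A sketch of the per-channel weight
   this makes possible downstream (the consumer is not part of these hunks, so this is an
   assumption about the intended use):
     // weight of channel 'channelId' (1-based) for event 'ievt':
     const fptype w = numerators[ievt * ndiagrams + ( channelId - 1 )] / denominators[ievt];
*/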
for diagram number 2 VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -515,8 +582,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; @@ -543,8 +613,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -629,8 +702,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; @@ -657,8 +733,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -981,8 +1060,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -1009,8 +1091,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -1037,8 +1122,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] 
); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -1123,8 +1211,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -1151,8 +1242,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -1179,8 +1273,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; @@ -1265,8 +1362,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -1293,8 +1393,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -1321,8 +1424,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -1411,8 +1517,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += 
cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1425,8 +1534,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1439,8 +1551,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1459,8 +1574,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += amp_sv[0]; jamp_sv[68] -= amp_sv[0]; @@ -1475,8 +1593,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1495,8 +1616,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += amp_sv[0]; jamp_sv[62] -= amp_sv[0]; @@ -1546,8 +1670,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1560,8 +1687,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[33] += cxabs2( amp_sv[0] 
); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1574,8 +1704,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += amp_sv[0]; jamp_sv[55] -= amp_sv[0]; @@ -1590,8 +1723,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1604,8 +1740,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 37 FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[36] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1618,8 +1757,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 38 FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[37] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[54] -= amp_sv[0]; @@ -1634,8 +1776,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 39 FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[38] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[49] -= amp_sv[0]; @@ -1650,8 +1795,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 40 FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[39] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] += amp_sv[0]; jamp_sv[65] -= amp_sv[0]; @@ -1666,8 +1814,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 41 FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[40] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( 
amp_sv[0] ); + } #endif jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1687,8 +1838,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 42 FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[41] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1701,8 +1855,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 43 FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[42] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1715,8 +1872,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 44 VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[43] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1735,8 +1895,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 45 FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[44] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; @@ -1751,8 +1914,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 46 VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[45] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1771,8 +1937,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 47 FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[46] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += amp_sv[0]; jamp_sv[86] -= amp_sv[0]; @@ -1822,8 +1991,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 49 FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[48] += cxabs2( amp_sv[0] ); + 
denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1836,8 +2008,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 50 FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[49] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1850,8 +2025,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 51 FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[50] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += amp_sv[0]; jamp_sv[79] -= amp_sv[0]; @@ -1866,8 +2044,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 52 FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[51] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1880,8 +2061,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 53 FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[52] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1894,8 +2078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 54 FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[53] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[78] -= amp_sv[0]; @@ -1910,8 +2097,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 55 FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[54] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[73] -= amp_sv[0]; @@ -1926,8 +2116,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 56 FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[55] += cxabs2( amp_sv[0] ); + 
denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[88] += amp_sv[0]; jamp_sv[89] -= amp_sv[0]; @@ -1942,8 +2135,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 57 FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[56] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1963,8 +2159,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 58 FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[57] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1977,8 +2176,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 59 FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[58] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1991,8 +2193,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 60 VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[59] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2011,8 +2216,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 61 FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[60] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; @@ -2027,8 +2235,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 62 VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[61] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2047,8 +2258,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 63 FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + 
numerators_sv[62] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; @@ -2097,8 +2311,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 65 FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[64] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2111,8 +2328,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 66 FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[65] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2125,8 +2345,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 67 FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[66] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; @@ -2141,8 +2364,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 68 FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[67] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2155,8 +2381,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 69 FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[68] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2169,8 +2398,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 70 FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[69] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; @@ -2185,8 +2417,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 71 FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { 
+ numerators_sv[70] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= amp_sv[0]; @@ -2201,8 +2436,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 72 FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[71] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[112] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; @@ -2217,8 +2455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 73 FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[72] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2238,8 +2479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 74 FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 74 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[73] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2252,8 +2496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 75 FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[74] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2266,8 +2513,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 76 VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[75] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2286,8 +2536,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 77 FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[76] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -2302,8 +2555,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 78 VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + 
numerators_sv[77] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2322,8 +2578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 79 FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[78] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; @@ -2372,8 +2631,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 81 FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[80] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -2388,8 +2650,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 82 FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[81] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; @@ -2404,8 +2669,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 83 FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[82] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2424,8 +2692,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 84 FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[83] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2438,8 +2709,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 85 FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[84] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2452,8 +2726,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 86 VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[85] 
+= cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2472,8 +2749,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 87 FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[86] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -2488,8 +2768,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 88 VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[87] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2508,8 +2791,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 89 FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[88] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[28] -= amp_sv[0]; @@ -2558,8 +2844,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 91 FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[90] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -2574,8 +2863,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 92 FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[91] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += amp_sv[0]; jamp_sv[68] -= amp_sv[0]; @@ -2590,8 +2882,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 93 FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 93 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[92] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2610,8 +2905,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 94 FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[93] += cxabs2( amp_sv[0] ); + denominators_sv += 
cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2624,8 +2922,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 95 FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[94] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2638,8 +2939,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 96 VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[95] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2658,8 +2962,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 97 FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[96] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -2674,8 +2981,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 98 VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[97] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2694,8 +3004,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 99 FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[98] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -2744,8 +3057,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 101 FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[100] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -2760,8 +3076,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 102 FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[101] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( 
amp_sv[0] );
+    }
 #endif
     jamp_sv[60] += amp_sv[0];
     jamp_sv[62] -= amp_sv[0];
@@ -2776,8 +3095,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 103
     FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[102] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2796,8 +3118,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 104
     FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[103] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += amp_sv[0];
     jamp_sv[5] -= amp_sv[0];
@@ -2812,8 +3137,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 105
     FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[104] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2832,8 +3160,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 106
     FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[105] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[64] += amp_sv[0];
     jamp_sv[65] -= amp_sv[0];
@@ -2848,8 +3179,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 107
     FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 107 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[106] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2868,8 +3202,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 108
     FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[107] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2888,8 +3225,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 109
     FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[108] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2908,8 +3248,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 110
     FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[109] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += amp_sv[0];
     jamp_sv[3] -= amp_sv[0];
@@ -2924,8 +3267,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 111
     FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[110] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2944,8 +3290,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 112
     FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[111] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[70] += amp_sv[0];
     jamp_sv[71] -= amp_sv[0];
@@ -2960,8 +3309,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 113
     FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[112] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2980,8 +3332,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 114
     FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 114 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[113] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3000,8 +3355,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 115
     FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 115 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[114] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3020,8 +3378,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 116
     FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 116 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[115] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += amp_sv[0];
     jamp_sv[1] -= amp_sv[0];
@@ -3036,8 +3397,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 117
     FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 117 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[116] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3056,8 +3420,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 118
     FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 118 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[117] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[94] += amp_sv[0];
     jamp_sv[95] -= amp_sv[0];
@@ -3072,8 +3439,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 119
     FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 119 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[118] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3092,8 +3462,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 120
     FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 120 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[119] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3112,8 +3485,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 121
     FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 121 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[120] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3203,8 +3579,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 124
     FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 124 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[123] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[11] -= amp_sv[0];

@@ -3216,8 +3595,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 125
     FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 125 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[124] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] -= amp_sv[0];

@@ -3230,9 +3612,12 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 126
     FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 126 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
+    if( storeChannelWeights )
+    {
+      numerators_sv[125] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
+#endif
     jamp_sv[17] -= amp_sv[0];

     // *** DIAGRAM 127 OF 1240 ***
@@ -3243,8 +3628,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 127
     FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 127 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[126] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] -= amp_sv[0];

@@ -3256,8 +3644,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 128
     FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 128 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[127] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[23] -= amp_sv[0];

@@ -3269,8 +3660,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 129
     FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 129 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[128] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[21] -= amp_sv[0];

@@ -3282,8 +3676,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 130
     VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 130 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[129] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += amp_sv[0];
     jamp_sv[15] -= amp_sv[0];
@@ -3298,8 +3695,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 131
     FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 131 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[130] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3312,8 +3712,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 132
     FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 132 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[131] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3326,8 +3729,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 133
     VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 133 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[132] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[11] += amp_sv[0];
     jamp_sv[15] -= amp_sv[0];
@@ -3342,8 +3748,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 134
     FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 134 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[133] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3356,8 +3765,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 135
     FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 135 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[134] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3370,8 +3782,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 136
     VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 136 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[135] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += amp_sv[0];
     jamp_sv[11] -= amp_sv[0];
@@ -3386,8 +3801,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 137
     FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 137 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[136] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3400,8 +3818,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 138
     FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 138 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[137] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3438,8 +3859,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 140
     VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 140 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[139] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3458,8 +3882,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 141
     VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 141 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[140] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3512,8 +3939,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 143
     FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 143 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[142] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3526,8 +3956,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 144
     FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 144 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[143] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] += amp_sv[0];
     jamp_sv[14] -= amp_sv[0];
@@ -3542,8 +3975,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 145
     FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 145 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[144] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3556,8 +3992,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 146
     FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 146 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[145] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[18] += amp_sv[0];
     jamp_sv[20] -= amp_sv[0];
@@ -3572,8 +4011,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 147
     FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 147 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[146] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3586,8 +4028,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 148
     VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 148 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[147] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
@@ -3602,8 +4047,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 149
     FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 149 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[148] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3616,8 +4064,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 150
     FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 150 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[149] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3630,8 +4081,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 151
     VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 151 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[150] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += amp_sv[0];
     jamp_sv[6] -= amp_sv[0];
@@ -3646,8 +4100,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 152
     FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 152 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[151] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3660,8 +4117,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 153
     FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 153 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[152] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += amp_sv[0];
     jamp_sv[1] -= amp_sv[0];
@@ -3676,8 +4136,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 154
     VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 154 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[153] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3696,8 +4159,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 155
     FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 155 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[154] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] += amp_sv[0];
     jamp_sv[17] -= amp_sv[0];
@@ -3713,8 +4179,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 156
     VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 156 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[155] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3733,8 +4202,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 157
     VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 157 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[156] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3787,8 +4259,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 159
     FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 159 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[158] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3801,8 +4276,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 160
     FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 160 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[159] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
@@ -3817,8 +4295,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 161
     FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 161 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[160] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3831,8 +4312,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 162
     FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 162 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[161] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[19] += amp_sv[0];
     jamp_sv[20] -= amp_sv[0];
@@ -3847,8 +4331,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 163
     FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 163 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[162] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3861,8 +4348,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 164
     VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 164 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[163] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] += amp_sv[0];
     jamp_sv[13] -= amp_sv[0];
@@ -3877,8 +4367,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 165
     FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 165 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[164] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3891,8 +4384,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 166
     FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 166 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[165] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3905,8 +4401,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 167
     VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 167 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[166] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += amp_sv[0];
     jamp_sv[6] -= amp_sv[0];
@@ -3921,8 +4420,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 168
     FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 168 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[167] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3935,8 +4437,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 169
     FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 169 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[168] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += amp_sv[0];
     jamp_sv[3] -= amp_sv[0];
@@ -3951,8 +4456,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 170
     VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 170 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[169] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3971,8 +4479,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 171
     FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 171 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[170] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] += amp_sv[0];
     jamp_sv[11] -= amp_sv[0];
@@ -3988,8 +4499,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 172
     VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 172 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[171] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4008,8 +4522,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 173
     VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 173 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[172] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4062,8 +4579,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 175
     FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 175 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[174] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4076,8 +4596,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 176
     FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 176 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[175] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
@@ -4092,8 +4615,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 177
     FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 177 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[176] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4106,8 +4632,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 178
     FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 178 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[177] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[13] += amp_sv[0];
     jamp_sv[14] -= amp_sv[0];
@@ -4122,8 +4651,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 179
     FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 179 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[178] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4136,8 +4668,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 180
     VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 180 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[179] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] += amp_sv[0];
     jamp_sv[13] -= amp_sv[0];
@@ -4152,8 +4687,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 181
     FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 181 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[180] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4166,8 +4704,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 182
     FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 182 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[181] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4180,8 +4721,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 183
     VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 183 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[182] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
@@ -4196,8 +4740,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 184
     FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 184 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[183] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4210,8 +4757,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 185
     FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 185 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[184] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += amp_sv[0];
     jamp_sv[5] -= amp_sv[0];
@@ -4226,8 +4776,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 186
     VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 186 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[185] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4246,8 +4799,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 187
     FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 187 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[186] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += amp_sv[0];
     jamp_sv[9] -= amp_sv[0];
@@ -4262,8 +4818,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 188
     FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 188 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[187] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] -= amp_sv[0];

@@ -4275,8 +4834,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 189
     FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 189 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[188] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] -= amp_sv[0];

@@ -4288,8 +4850,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 190
     FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 190 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[189] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] -= amp_sv[0];

@@ -4301,8 +4866,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 191
     FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 191 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[190] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[13] -= amp_sv[0];

@@ -4314,8 +4882,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 192
     FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 192 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[191] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[22] -= amp_sv[0];

@@ -4327,8 +4898,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 193
     FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 193 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[192] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] -= amp_sv[0]; @@ -4340,8 +4914,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 194 FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 194 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[193] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4354,8 +4931,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 195 VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 195 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[194] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -4370,8 +4950,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 196 FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 196 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[195] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4384,8 +4967,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 197 FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 197 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[196] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -4397,8 +4983,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 198 FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 198 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[197] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -4410,8 +4999,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 199 FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 199 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[198] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= amp_sv[0]; @@ -4423,8 +5015,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 200 FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 200 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + 
if( storeChannelWeights ) + { + numerators_sv[199] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= amp_sv[0]; @@ -4436,8 +5031,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 201 FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 201 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[200] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] -= amp_sv[0]; @@ -4449,8 +5047,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 202 FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 202 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[201] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] -= amp_sv[0]; @@ -4462,8 +5063,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 203 FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 203 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[202] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4476,8 +5080,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 204 VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 204 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[203] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -4492,8 +5099,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 205 FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 205 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[204] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4506,8 +5116,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 206 FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 206 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[205] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -4519,8 +5132,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 207 FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 207 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[206] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -4532,8 
+5148,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 208 FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 208 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[207] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= amp_sv[0]; @@ -4545,8 +5164,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 209 FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 209 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[208] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= amp_sv[0]; @@ -4558,8 +5180,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 210 FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 210 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[209] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[14] -= amp_sv[0]; @@ -4571,8 +5196,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 211 FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 211 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[210] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] -= amp_sv[0]; @@ -4584,8 +5212,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 212 FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 212 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[211] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4598,8 +5229,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 213 VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 213 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[212] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -4614,8 +5248,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 214 FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 214 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[213] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4628,8 +5265,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 215 FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 215 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[214] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4642,8 +5282,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 216 FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 216 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[215] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -4658,8 +5301,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 217 VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 217 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[216] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4678,8 +5324,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 218 VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 218 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[217] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4732,8 +5381,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 220 FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 220 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[219] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -4748,8 +5400,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 221 FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 221 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[220] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4762,8 +5417,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 222 FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 222 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[221] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4776,8 +5434,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 223 FFV1_0( w_fp[3], w_fp[77], 
w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 223 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[222] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -4792,8 +5453,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 224 VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 224 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[223] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4812,8 +5476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 225 VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 225 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[224] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4866,8 +5533,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 227 FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 227 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[226] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -4882,8 +5552,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 228 FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 228 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[227] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4896,8 +5569,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 229 FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 229 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[228] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4910,8 +5586,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 230 FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 230 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[229] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -4926,8 +5605,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 231 VVV1_0( w_fp[62], 
w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 231 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[230] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4946,8 +5628,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 232 VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 232 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[231] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; @@ -5000,8 +5685,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 234 FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 234 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[233] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -5016,8 +5704,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 235 FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 235 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[234] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5329,8 +6020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 247 FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 247 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[246] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[41] -= amp_sv[0]; @@ -5342,8 +6036,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 248 FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 248 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[247] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[47] -= amp_sv[0]; @@ -5356,8 +6053,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 249 FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 249 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[248] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] -= amp_sv[0]; @@ -5369,8 +6069,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 250 FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 250 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[249] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] -= amp_sv[0]; @@ -5382,8 +6085,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 251 FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 251 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[250] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] -= amp_sv[0]; @@ -5395,8 +6101,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 252 FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 252 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[251] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[39] -= amp_sv[0]; @@ -5408,8 +6117,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 253 VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 253 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[252] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += amp_sv[0]; jamp_sv[39] -= amp_sv[0]; @@ -5424,8 +6136,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 254 FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 254 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[253] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5438,8 +6153,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 255 FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 255 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[254] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5452,8 +6170,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 256 VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 256 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[255] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] += amp_sv[0]; jamp_sv[39] -= amp_sv[0]; @@ -5468,8 +6189,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 257 FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 257 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + 
+        numerators_sv[256] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5482,8 +6206,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 258
       FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 258 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[257] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5496,8 +6223,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 259
       VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 259 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[258] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += amp_sv[0];
       jamp_sv[35] -= amp_sv[0];
@@ -5512,8 +6242,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 260
       FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 260 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[259] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5526,8 +6259,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 261
       FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 261 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[260] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5562,8 +6298,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 263
       VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 263 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[262] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5582,8 +6321,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 264
       VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 264 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[263] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5636,8 +6378,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 266
       FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 266 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[265] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5650,8 +6395,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 267
       FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 267 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[266] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] += amp_sv[0];
       jamp_sv[59] -= amp_sv[0];
@@ -5666,8 +6414,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 268
       FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 268 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[267] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5680,8 +6431,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 269
       FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 269 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[268] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += amp_sv[0];
       jamp_sv[57] -= amp_sv[0];
@@ -5696,8 +6450,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 270
       FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 270 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[269] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5710,8 +6467,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 271
       VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 271 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[270] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[81] += amp_sv[0];
       jamp_sv[87] -= amp_sv[0];
@@ -5726,8 +6486,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 272
       FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 272 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[271] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5740,8 +6503,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 273
       FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 273 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[272] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5754,8 +6520,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 274
       VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 274 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[273] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[105] += amp_sv[0];
       jamp_sv[111] -= amp_sv[0];
@@ -5770,8 +6539,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 275
       FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 275 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[274] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5784,8 +6556,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 276
       FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 276 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[275] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[93] += amp_sv[0];
       jamp_sv[95] -= amp_sv[0];
@@ -5800,8 +6575,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 277
       VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 277 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[276] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5820,8 +6598,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 278
       FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 278 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[277] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += amp_sv[0];
       jamp_sv[35] -= amp_sv[0];
@@ -5836,8 +6617,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 279
       VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 279 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[278] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5856,8 +6640,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 280
       VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 280 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[279] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5910,8 +6697,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 282
       FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 282 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[281] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5924,8 +6714,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 283
       FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 283 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[282] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] += amp_sv[0];
       jamp_sv[83] -= amp_sv[0];
@@ -5940,8 +6733,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 284
       FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 284 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[283] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5954,8 +6750,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 285
       FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 285 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[284] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] += amp_sv[0];
       jamp_sv[57] -= amp_sv[0];
@@ -5970,8 +6769,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 286
       FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 286 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[285] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5984,8 +6786,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 287
       VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 287 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[286] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[57] += amp_sv[0];
       jamp_sv[63] -= amp_sv[0];
@@ -6000,8 +6805,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 288
       FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 288 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[287] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6014,8 +6822,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 289
       FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 289 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[288] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6028,8 +6839,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 290
       VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 290 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[289] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[107] += amp_sv[0];
       jamp_sv[111] -= amp_sv[0];
@@ -6044,8 +6858,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 291
       FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 291 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[290] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6058,8 +6875,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 292
       FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 292 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[291] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[69] += amp_sv[0];
       jamp_sv[71] -= amp_sv[0];
@@ -6074,8 +6894,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 293
       VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 293 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[292] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6094,8 +6917,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 294
       FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 294 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[293] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] += amp_sv[0];
       jamp_sv[41] -= amp_sv[0];
@@ -6110,8 +6936,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 295
       VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 295 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[294] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6130,8 +6959,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 296
       VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 296 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[295] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6184,8 +7016,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 298
       FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 298 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[297] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6198,8 +7033,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 299
       FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 299 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[298] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[47] += amp_sv[0];
       jamp_sv[83] -= amp_sv[0];
@@ -6214,8 +7052,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 300
       FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 300 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[299] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6228,8 +7069,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 301
       FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 301 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[300] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[45] += amp_sv[0];
       jamp_sv[59] -= amp_sv[0];
@@ -6244,8 +7088,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 302
       FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 302 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[301] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6258,8 +7105,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 303
       VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 303 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[302] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[59] += amp_sv[0];
       jamp_sv[63] -= amp_sv[0];
@@ -6274,8 +7124,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 304
       FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 304 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[303] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6288,8 +7141,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 305
       FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 305 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[304] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6302,8 +7158,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 306
       VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 306 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[305] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[83] += amp_sv[0];
       jamp_sv[87] -= amp_sv[0];
@@ -6318,8 +7177,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 307
       FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 307 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[306] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6332,8 +7194,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 308
       FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 308 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[307] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[63] += amp_sv[0];
       jamp_sv[65] -= amp_sv[0];
@@ -6348,8 +7213,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 309
       VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 309 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[308] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6368,8 +7236,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 310
       FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 310 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[309] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[45] += amp_sv[0];
       jamp_sv[47] -= amp_sv[0];
@@ -6384,8 +7255,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 311
       FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 311 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[310] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[65] -= amp_sv[0];

@@ -6397,8 +7271,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 312
       FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 312 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[311] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[71] -= amp_sv[0];

@@ -6410,8 +7287,11 @@ namespace mg5amcCpu
      // Amplitude(s) for diagram number 313
       FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 313 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[312] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[59] -= amp_sv[0];

@@ -6423,8 +7303,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 314
       FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 314 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[313] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[69] -= amp_sv[0];

@@ -6436,8 +7319,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 315
       FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 315 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[314] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[57] -= amp_sv[0];

@@ -6449,8 +7335,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 316
       FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 316 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[315] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[63] -= amp_sv[0];

@@ -6462,8 +7351,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 317
       FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 317 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[316] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6476,8 +7368,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 318
       VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 318 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[317] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[57] += amp_sv[0];
       jamp_sv[59] -= amp_sv[0];
@@ -6492,8 +7387,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 319
       FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 319 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[318] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6506,8 +7404,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 320
       FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 320 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[319] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[89] -= amp_sv[0];

@@ -6519,8 +7420,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 321
       FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 321 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[320] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[95] -= amp_sv[0];

@@ -6532,8 +7436,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 322
       FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 322 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[321] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[83] -= amp_sv[0];

@@ -6545,8 +7452,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 323
       FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 323 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[322] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[93] -= amp_sv[0];

@@ -6558,8 +7468,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 324
       FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 324 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[323] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[81] -= amp_sv[0];

@@ -6571,8 +7484,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 325
       FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 325 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[324] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[87] -= amp_sv[0];

@@ -6584,8 +7500,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 326
       FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 326 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[325] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6598,8 +7517,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 327
       VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 327 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[326] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[81] += amp_sv[0];
       jamp_sv[83] -= amp_sv[0];
@@ -6614,8 +7536,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 328
       FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 328 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[327] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6628,8 +7553,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 329
       FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 329 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[328] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[113] -= amp_sv[0];

@@ -6641,8 +7569,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 330
       FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 330 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[329] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[119] -= amp_sv[0];

@@ -6654,8 +7585,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 331
       FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 331 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[330] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[107] -= amp_sv[0];

@@ -6667,8 +7601,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 332
       FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 332 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[331] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[117] -= amp_sv[0];

@@ -6680,8 +7617,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 333
       FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 333 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[332] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[105] -= amp_sv[0];

@@ -6693,8 +7633,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 334
       FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 334 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[333] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[111] -= amp_sv[0];

@@ -6706,8 +7649,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 335
       FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 335 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[334] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6720,8 +7666,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 336
       VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 336 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[335] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[105] += amp_sv[0];
       jamp_sv[107] -= amp_sv[0];
@@ -6736,8 +7685,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 337
       FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 337 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[336] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6750,8 +7702,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 338
       FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 338 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[337] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6764,8 +7719,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 339
       FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 339 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[338] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[65] += amp_sv[0];
       jamp_sv[89] -= amp_sv[0];
@@ -6780,8 +7738,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 340
       VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 340 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[339] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6800,8 +7761,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 341
       VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 341 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[340] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6854,8 +7818,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 343
       FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 343 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[342] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += amp_sv[0];
       jamp_sv[39] -= amp_sv[0];
@@ -6870,8 +7837,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 344
       FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 344 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[343] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6884,8 +7854,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 345
       FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 345 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[344] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6898,8 +7871,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 346
       FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 346 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[345] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[71] += amp_sv[0];
       jamp_sv[89] -= amp_sv[0];
@@ -6914,8 +7890,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 347
       VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 347 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[346] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6934,8 +7913,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 348
       VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 348 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[347] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6988,8 +7970,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 350
       FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 350 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[349] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] += amp_sv[0];
       jamp_sv[45] -= amp_sv[0];
@@ -7004,8 +7989,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 351
       FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 351 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[350] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7018,8 +8006,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 352
       FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 352 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[351] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7032,8 +8023,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 353
       FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 353 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[352] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[65] += amp_sv[0];
       jamp_sv[71] -= amp_sv[0];
@@ -7048,8 +8042,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 354
       VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 354 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[353] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7068,8 +8065,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 355
       VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 355 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[354] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7122,8 +8122,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 357
       FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 357 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[356] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] += amp_sv[0];
       jamp_sv[47] -= amp_sv[0];
@@ -7138,8 +8141,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 358
       FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 358 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[357] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7443,8 +8449,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 370
       FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 370 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[369] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7457,8 +8466,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 371
       FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 371 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[370] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7472,8 +8484,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 372
       VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 372 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[371] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7492,8 +8507,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 373
       FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 373 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[372] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[42] += amp_sv[0];
       jamp_sv[44] -= amp_sv[0];
@@ -7508,8 +8526,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 374
       VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 374 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[373] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7528,8 +8549,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 375
       FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 375 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[374] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] += amp_sv[0];
       jamp_sv[38] -= amp_sv[0];
@@ -7580,8 +8604,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 377
       FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 377 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[376] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7594,8 +8621,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 378
       FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 378 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[377] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7608,8 +8638,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 379
       FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 379 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[378] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += amp_sv[0];
       jamp_sv[31] -= amp_sv[0];
@@ -7624,8 +8657,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 380
       FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 380 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[379] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7638,8 +8674,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 381
       FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 381 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[380] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7652,8 +8691,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 382
       FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 382 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[381] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += amp_sv[0];
       jamp_sv[30] -= amp_sv[0];
@@ -7668,8 +8710,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 383
       FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 383 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[382] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += amp_sv[0];
       jamp_sv[25] -= amp_sv[0];
@@ -7684,8 +8729,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 384
       FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 384 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[383] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[40] += amp_sv[0];
       jamp_sv[41] -= amp_sv[0];
@@ -7700,8 +8748,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 385
       FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 385 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[384] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7720,8 +8771,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 386
       FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 386 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[385] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7734,8 +8788,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 387
       FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 387 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[386] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7748,8 +8805,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 388
       VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 388 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[387] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7768,8 +8828,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 389
       FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 389 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[388] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[51] -= amp_sv[0];
@@ -7784,8 +8847,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 390
       VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 390 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[389] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7804,8 +8870,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 391
       FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 391 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[390] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += amp_sv[0];
       jamp_sv[53] -= amp_sv[0];
@@ -7854,8 +8923,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 393
       FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 393 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[392] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7868,8 +8940,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 394
       FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 394 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[393] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7882,8 +8957,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 395
       FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 395 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[394] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] += amp_sv[0];
       jamp_sv[85] -= amp_sv[0];
@@ -7898,8 +8976,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 396
       FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 396 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[395] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7912,8 +8993,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 397
w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 397 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[396] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7926,8 +9010,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 398 FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 398 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[397] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; @@ -7942,8 +9029,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 399 FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 399 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[398] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; @@ -7958,8 +9048,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 400 FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 400 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[399] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -7974,8 +9067,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 401 FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 401 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[400] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; @@ -7994,8 +9090,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 402 FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 402 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[401] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -8010,8 +9109,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 403 FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 403 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[402] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8030,8 +9132,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 404 FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 
1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 404 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[403] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += amp_sv[0]; jamp_sv[41] -= amp_sv[0]; @@ -8046,8 +9151,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 405 FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 405 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[404] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8066,8 +9174,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 406 FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 406 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[405] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8086,8 +9197,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 407 FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 407 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[406] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8164,8 +9278,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 409 VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 409 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[408] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -8192,8 +9309,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 410 VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 410 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[409] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -8220,8 +9340,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 411 VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 411 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[410] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= amp_sv[0]; jamp_sv[11] += amp_sv[0]; @@ -8248,8 +9371,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 412 FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 412 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[411] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8268,8 +9394,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 413 FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 413 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[412] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; @@ -8284,8 +9413,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 414 FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 414 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[413] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[106] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; @@ -8300,8 +9432,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 415 FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 415 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[414] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8320,8 +9455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 416 FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 416 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[415] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -8336,8 +9474,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 417 FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 417 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[416] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] += amp_sv[0]; jamp_sv[38] -= amp_sv[0]; @@ -8352,8 +9493,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 418 FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 418 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[417] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[9] -= amp_sv[0]; @@ -8368,8 +9512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 419 FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 419 
) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[418] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8388,8 +9535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 420 FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 420 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[419] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] += amp_sv[0]; jamp_sv[47] -= amp_sv[0]; @@ -8404,8 +9554,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 421 FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 421 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[420] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8424,8 +9577,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 422 FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 422 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[421] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8444,8 +9600,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 423 FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 423 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[422] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8522,8 +9681,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 425 VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 425 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[424] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -8550,8 +9712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 426 VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 426 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[425] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -8578,8 +9743,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 427 VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId 
== 427 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[426] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= amp_sv[0]; jamp_sv[9] += amp_sv[0]; @@ -8606,8 +9774,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 428 FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 428 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[427] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8626,8 +9797,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 429 FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 429 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[428] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += amp_sv[0]; jamp_sv[75] -= amp_sv[0]; @@ -8642,8 +9816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 430 FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 430 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[429] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[82] += amp_sv[0]; jamp_sv[83] -= amp_sv[0]; @@ -8658,8 +9835,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 431 FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 431 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[430] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8678,8 +9858,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 432 FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 432 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[431] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -8694,8 +9877,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 433 FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 433 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[432] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += amp_sv[0]; jamp_sv[44] -= amp_sv[0]; @@ -8710,8 +9896,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 434 VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 434 ) numerators_sv += cxabs2( amp_sv[0] ); - if( 
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[433] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; @@ -8738,8 +9927,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 435 VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 435 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[434] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -8824,8 +10016,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 437 VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 437 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[436] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -8852,8 +10047,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 438 VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 438 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[437] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -8938,8 +10136,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 440 VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 440 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[439] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -8966,8 +10167,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 441 VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 441 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[440] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -9288,8 +10492,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 447 VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 447 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[446] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -9316,8 +10523,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 448 VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 448 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[447] += 
cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -9344,8 +10554,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 449 VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 449 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[448] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= amp_sv[0]; jamp_sv[11] += amp_sv[0]; @@ -9372,8 +10585,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 450 VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 450 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[449] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9392,8 +10608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 451 FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 451 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[450] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; @@ -9408,8 +10627,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 452 FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 452 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[451] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9422,8 +10644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 453 FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 453 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[452] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9436,8 +10661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 454 FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 454 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[453] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] += amp_sv[0]; jamp_sv[80] -= amp_sv[0]; @@ -9452,8 +10680,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 455 VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 455 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[454] += cxabs2( amp_sv[0] 
); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; @@ -9506,8 +10737,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 457 FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 457 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[456] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += amp_sv[0]; jamp_sv[78] -= amp_sv[0]; @@ -9522,8 +10756,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 458 FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 458 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[457] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9536,8 +10773,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 459 FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 459 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[458] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9550,8 +10790,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 460 VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 460 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[459] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9570,8 +10813,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 461 FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 461 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[460] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[115] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; @@ -9586,8 +10832,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 462 FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 462 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[461] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9600,8 +10849,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 463 FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 463 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( 
storeChannelWeights ) + { + numerators_sv[462] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9614,8 +10866,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 464 FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 464 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[463] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] += amp_sv[0]; jamp_sv[104] -= amp_sv[0]; @@ -9630,8 +10885,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 465 VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 465 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[464] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; @@ -9684,8 +10942,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 467 FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 467 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[466] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; @@ -9700,8 +10961,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 468 FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 468 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[467] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9714,8 +10978,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 469 FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 469 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[468] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9728,8 +10995,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 470 VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 470 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[469] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9748,8 +11018,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 471 FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 471 ) numerators_sv += cxabs2( amp_sv[0] ); - 
if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[470] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -9764,8 +11037,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 472 FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 472 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[471] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9778,8 +11054,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 473 FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 473 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[472] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9792,8 +11071,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 474 FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 474 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[473] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[52] -= amp_sv[0]; @@ -9808,8 +11090,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 475 VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 475 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[474] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; @@ -9862,8 +11147,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 477 VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 477 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[476] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9882,8 +11170,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 478 FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 478 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[477] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -9898,8 +11189,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 479 FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 479 ) numerators_sv += 
cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[478] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9912,8 +11206,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 480 FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 480 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[479] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9926,8 +11223,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 481 FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 481 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[480] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[50] -= amp_sv[0]; @@ -9942,8 +11242,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 482 VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 482 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[481] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; @@ -9996,8 +11299,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 484 FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 484 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[483] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10016,8 +11322,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 485 FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 485 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[484] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10036,8 +11345,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 486 FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 486 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[485] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10056,8 +11368,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 487 FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 487 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[486] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -10072,8 +11387,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 488 FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 488 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[487] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10092,8 +11410,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 489 FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 489 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[488] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[92] += amp_sv[0]; jamp_sv[93] -= amp_sv[0]; @@ -10235,8 +11556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 493 FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 493 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[492] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10249,8 +11573,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 494 FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 494 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[493] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10263,8 +11590,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 495 VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 495 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[494] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10283,8 +11613,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 496 FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 496 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[495] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] += amp_sv[0]; jamp_sv[44] -= amp_sv[0]; @@ -10299,8 +11632,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 497 VVV1_0( w_fp[104], 
w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 497 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[496] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10319,8 +11655,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 498 FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 498 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[497] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] += amp_sv[0]; jamp_sv[32] -= amp_sv[0]; @@ -10371,8 +11710,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 500 FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 500 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[499] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10385,8 +11727,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 501 FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 501 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[500] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10399,8 +11744,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 502 FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 502 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[501] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -10415,8 +11763,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 503 FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 503 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[502] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10429,8 +11780,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 504 FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 504 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[503] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10443,8 +11797,11 @@ namespace 
mg5amcCpu // Amplitude(s) for diagram number 505 FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 505 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[504] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -10459,8 +11816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 506 FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 506 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[505] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -10475,8 +11835,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 507 FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 507 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[506] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] += amp_sv[0]; jamp_sv[35] -= amp_sv[0]; @@ -10491,8 +11854,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 508 FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 508 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[507] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10511,8 +11877,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 509 FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 509 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[508] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10525,8 +11894,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 510 FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 510 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[509] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10539,8 +11911,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 511 VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 511 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[510] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; @@ 
-10559,8 +11934,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 512 FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 512 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[511] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += amp_sv[0]; jamp_sv[51] -= amp_sv[0]; @@ -10575,8 +11953,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 513 VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 513 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[512] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10595,8 +11976,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 514 FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 514 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[513] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += amp_sv[0]; jamp_sv[77] -= amp_sv[0]; @@ -10645,8 +12029,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 516 FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 516 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[515] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10659,8 +12046,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 517 FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 517 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[516] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10673,8 +12063,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 518 FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 518 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[517] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; @@ -10689,8 +12082,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 519 FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 519 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[518] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] 
-= cxtype( 0, 1 ) * amp_sv[0]; @@ -10703,8 +12099,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 520 FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 520 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[519] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10717,8 +12116,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 521 FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 521 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[520] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[101] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; @@ -10733,8 +12135,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 522 FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 522 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[521] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[67] += amp_sv[0]; jamp_sv[70] -= amp_sv[0]; @@ -10749,8 +12154,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 523 FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 523 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[522] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; @@ -10765,8 +12173,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 524 FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 524 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[523] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10785,8 +12196,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 525 FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 525 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[524] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; @@ -10801,8 +12215,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 526 FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 526 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[525] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; 
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10821,8 +12238,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 527
       FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 527 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[526] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] += amp_sv[0];
       jamp_sv[35] -= amp_sv[0];
@@ -10837,8 +12257,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 528
       FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 528 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[527] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10857,8 +12280,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 529
       FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 529 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[528] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10877,8 +12303,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 530
       FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 530 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[529] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10955,8 +12384,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 532
       VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 532 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[531] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -10983,8 +12415,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 533
       VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 533 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[532] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -11011,8 +12446,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 534
       VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 534 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[533] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= amp_sv[0];
       jamp_sv[17] += amp_sv[0];
@@ -11039,8 +12477,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 535
       FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 535 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[534] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11059,8 +12500,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 536
       FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 536 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[535] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] += amp_sv[0];
       jamp_sv[101] -= amp_sv[0];
@@ -11075,8 +12519,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 537
       FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 537 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[536] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[104] += amp_sv[0];
       jamp_sv[105] -= amp_sv[0];
@@ -11091,8 +12538,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 538
       FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 538 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[537] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11111,8 +12561,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 539
       FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 539 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[538] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -11127,8 +12580,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 540
       FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 540 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[539] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] += amp_sv[0];
       jamp_sv[32] -= amp_sv[0];
@@ -11143,8 +12599,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 541
       FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 541 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[540] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -11159,8 +12618,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 542
       FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 542 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[541] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11179,8 +12641,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 543
       FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 543 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[542] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[44] += amp_sv[0];
       jamp_sv[45] -= amp_sv[0];
@@ -11195,8 +12660,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 544
       FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 544 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[543] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11215,8 +12683,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 545
       FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 545 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[544] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11235,8 +12706,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 546
       FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 546 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[545] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11313,8 +12787,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 548
       VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 548 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[547] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[16] -= amp_sv[0];
@@ -11341,8 +12818,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 549
       VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 549 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[548] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -11369,8 +12849,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 550
       VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 550 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[549] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= amp_sv[0];
       jamp_sv[15] += amp_sv[0];
@@ -11397,8 +12880,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 551
       FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 551 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[550] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11417,8 +12903,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 552
       FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 552 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[551] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[50] += amp_sv[0];
       jamp_sv[51] -= amp_sv[0];
@@ -11433,8 +12922,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 553
       FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 553 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[552] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[58] += amp_sv[0];
       jamp_sv[59] -= amp_sv[0];
@@ -11449,8 +12941,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 554
       FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 554 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[553] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11469,8 +12964,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 555
       FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 555 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[554] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[16] -= amp_sv[0];
@@ -11485,8 +12983,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 556
       FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 556 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[555] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[43] += amp_sv[0];
       jamp_sv[46] -= amp_sv[0];
@@ -11501,8 +13002,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 557
       VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 557 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[556] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= amp_sv[0];
       jamp_sv[27] += amp_sv[0];
@@ -11529,8 +13033,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 558
       VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 558 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[557] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= amp_sv[0];
       jamp_sv[26] += amp_sv[0];
@@ -11615,8 +13122,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 560
       VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 560 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[559] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[15] += amp_sv[0];
       jamp_sv[26] -= amp_sv[0];
@@ -11643,8 +13153,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 561
       VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 561 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[560] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += amp_sv[0];
       jamp_sv[26] -= amp_sv[0];
@@ -11729,8 +13242,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 563
       VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 563 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[562] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] += amp_sv[0];
       jamp_sv[27] -= amp_sv[0];
@@ -11757,8 +13273,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 564
       VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 564 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[563] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += amp_sv[0];
       jamp_sv[27] -= amp_sv[0];
@@ -12079,8 +13598,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 570
       VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 570 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[569] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -12107,8 +13629,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 571
       VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 571 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[570] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -12135,8 +13660,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 572
       VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 572 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[571] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[15] -= amp_sv[0];
       jamp_sv[17] += amp_sv[0];
@@ -12163,8 +13691,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 573
       VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 573 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[572] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12183,8 +13714,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 574
       FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 574 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[573] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[67] += amp_sv[0];
       jamp_sv[68] -= amp_sv[0];
@@ -12199,8 +13733,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 575
       FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 575 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[574] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12213,8 +13750,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 576
       FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 576 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[575] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12227,8 +13767,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 577
       FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 577 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[576] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[54] += amp_sv[0];
       jamp_sv[56] -= amp_sv[0];
@@ -12243,8 +13786,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 578
       VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 578 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[577] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12297,8 +13843,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 580
       FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 580 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[579] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[50] += amp_sv[0];
       jamp_sv[54] -= amp_sv[0];
@@ -12313,8 +13862,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 581
       FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 581 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[580] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12327,8 +13879,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 582
       FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 582 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[581] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12341,8 +13896,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 583
       VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 583 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[582] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12361,8 +13919,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 584
       FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 584 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[583] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[109] += amp_sv[0];
       jamp_sv[110] -= amp_sv[0];
@@ -12377,8 +13938,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 585
       FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 585 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[584] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12391,8 +13955,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 586
       FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 586 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[585] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12405,8 +13972,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 587
       FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 587 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[586] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[103] += amp_sv[0];
       jamp_sv[104] -= amp_sv[0];
@@ -12421,8 +13991,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 588
       VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 588 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[587] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12475,8 +14048,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 590
       FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 590 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[589] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] += amp_sv[0];
       jamp_sv[103] -= amp_sv[0];
@@ -12491,8 +14067,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 591
       FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 591 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[590] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12505,8 +14084,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 592
       FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 592 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[591] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12519,8 +14101,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 593
       VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 593 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[592] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12539,8 +14124,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 594
       FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 594 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[593] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[27] -= amp_sv[0];
@@ -12555,8 +14143,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 595
       FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 595 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[594] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12569,8 +14160,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 596
       FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 596 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[595] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12583,8 +14177,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 597
       FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 597 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[596] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += amp_sv[0];
       jamp_sv[76] -= amp_sv[0];
@@ -12599,8 +14196,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 598
       VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 598 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[597] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12653,8 +14253,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 600
       VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 600 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[599] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12673,8 +14276,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 601
       FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 601 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[600] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[26] -= amp_sv[0];
@@ -12689,8 +14295,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 602
       FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 602 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[601] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12703,8 +14312,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 603
       FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 603 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[602] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12717,8 +14329,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 604
       FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 604 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[603] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += amp_sv[0];
       jamp_sv[50] -= amp_sv[0];
@@ -12733,8 +14348,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 605
       VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 605 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[604] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12787,8 +14405,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 607
       FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 607 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[606] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12807,8 +14428,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 608
       FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 608 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[607] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12827,8 +14451,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 609
       FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 609 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[608] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12847,8 +14474,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 610
       FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 610 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[609] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -12863,8 +14493,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 611
       FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 611 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[610] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12883,8 +14516,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 612
       FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 612 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[611] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[68] += amp_sv[0];
       jamp_sv[69] -= amp_sv[0];
@@ -13026,8 +14662,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 616
       FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 616 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[615] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13040,8 +14679,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 617
       FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 617 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[616] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13054,8 +14696,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 618
       VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 618 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[617] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13074,8 +14719,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 619
       FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 619 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[618] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] += amp_sv[0];
       jamp_sv[38] -= amp_sv[0];
@@ -13090,8 +14738,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 620
       VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 620 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[619] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13110,8 +14761,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 621
       FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 621 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[620] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] += amp_sv[0];
       jamp_sv[32] -= amp_sv[0];
@@ -13162,8 +14816,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 623
       FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 623 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[622] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13176,8 +14833,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 624
       FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 624 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[623] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13190,8 +14850,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 625
       FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 625 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[624] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[29] += amp_sv[0];
       jamp_sv[37] -= amp_sv[0];
@@ -13206,8 +14869,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 626
       FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 626 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[625] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13220,8 +14886,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 627
       FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 627 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[626] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13234,8 +14903,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 628
       FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 628 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[627] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] += amp_sv[0];
       jamp_sv[31] -= amp_sv[0];
@@ -13250,8 +14922,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 629
       FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 629 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[628] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] += amp_sv[0];
       jamp_sv[29] -= amp_sv[0];
@@ -13266,8 +14941,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 630
       FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 630 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[629] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += amp_sv[0];
       jamp_sv[33] -= amp_sv[0];
@@ -13282,8 +14960,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 631
       FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 631 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[630] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13302,8 +14983,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 632
       FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 632 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[631] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13316,8 +15000,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 633
       FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 633 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[632] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13330,8 +15017,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 634
       VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 634 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[633] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13350,8 +15040,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 635
       FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 635 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[634] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += amp_sv[0];
       jamp_sv[53] -= amp_sv[0];
@@ -13366,8 +15059,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 636
       VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 636 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[635] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13386,8 +15082,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 637
       FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 637 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[636] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] += amp_sv[0];
       jamp_sv[77] -= amp_sv[0];
@@ -13436,8 +15135,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 639
       FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 639 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[638] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13450,8 +15152,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 640
       FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 640 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[639] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13464,8 +15169,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 641
       FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 641 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[640] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[53] += amp_sv[0];
       jamp_sv[61] -= amp_sv[0];
@@ -13480,8 +15188,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 642
       FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 642 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[641] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13494,8 +15205,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 643
       FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 643 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[642] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13508,8 +15222,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 644
       FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 644 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[643] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[77] += amp_sv[0];
       jamp_sv[85] -= amp_sv[0];
@@ -13524,8 +15241,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 645
       FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 645 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[644] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[61] += amp_sv[0];
       jamp_sv[64] -= amp_sv[0];
@@ -13540,8 +15260,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 646
       FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 646 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[645] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += amp_sv[0];
       jamp_sv[23] -= amp_sv[0];
@@ -13556,8 +15279,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 647
       FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 647 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[646] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13576,8 +15302,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 648
       FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 648 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[647] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] += amp_sv[0];
       jamp_sv[23] -= amp_sv[0];
@@ -13592,8 +15321,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 649
       FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 649 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[648] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13612,8 +15344,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 650
       FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 650 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[649] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += amp_sv[0];
       jamp_sv[33] -= amp_sv[0];
@@ -13628,8 +15363,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 651
       FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 651 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[650] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13648,8 +15386,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 652
       FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 652 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[651] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13668,8 +15409,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 653
       FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 653 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[652] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13746,8 +15490,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 655
       VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 655 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[654] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -13774,8 +15521,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 656
       VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 656 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[655] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -13802,8 +15552,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 657
       VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 657 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[656] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] -= amp_sv[0];
       jamp_sv[23] += amp_sv[0];
@@ -13830,8 +15583,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 658
       FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 658 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[657] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13850,8 +15606,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 659
       FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 659 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[658] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[76] += amp_sv[0];
       jamp_sv[77] -= amp_sv[0];
@@ -13866,8 +15625,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 660
       FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 660 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[659] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[80] += amp_sv[0];
       jamp_sv[81] -= amp_sv[0];
@@ -13882,8 +15644,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 661
       FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 661 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[660] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13902,8 +15667,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 662
       FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 662 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[661] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -13918,8 +15686,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 663
       FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 663 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[662] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] += amp_sv[0];
       jamp_sv[34] -= amp_sv[0];
@@ -13934,8 +15705,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 664
       FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 664 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[663] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += amp_sv[0];
       jamp_sv[21] -= amp_sv[0];
@@ -13950,8 +15724,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 665
       FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 665 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[664] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13970,8 +15747,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 666
       FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 666 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[665] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] += amp_sv[0];
       jamp_sv[39] -= amp_sv[0];
@@ -13986,8 +15766,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 667
       FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 667 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[666] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
@@ -14006,8 +15789,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 668
       FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 668 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[667] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
@@ -14026,8 +15812,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 669
       FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 669 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[668] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
@@ -14104,8 +15893,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 671
       VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 671 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[670] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[22] -= amp_sv[0];
@@ -14132,8 +15924,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 672
       VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 672 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[671] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -14160,8 +15955,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 673
       VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 673 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[672] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] -= amp_sv[0];
       jamp_sv[21] += amp_sv[0];
@@ -14188,8 +15986,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 674
       FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 674 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[673] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
@@ -14208,8 +16009,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 675
       FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 675 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[674] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[52] += amp_sv[0];
       jamp_sv[53] -= amp_sv[0];
@@ -14224,8 +16028,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 676
       FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 676 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[675] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[56] += amp_sv[0];
       jamp_sv[57] -= amp_sv[0];
@@ -14240,8 +16047,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 677
       FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 677 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[676] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
@@ -14260,8 +16070,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 678
       FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 678 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[677] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[22] -= amp_sv[0];
@@ -14276,8 +16089,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 679
       FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 679 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[678] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] += amp_sv[0];
       jamp_sv[40] -= amp_sv[0];
@@ -14292,8 +16108,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 680
       VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 680 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[679] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= amp_sv[0];
       jamp_sv[29] += amp_sv[0];
@@ -14320,8 +16139,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 681
       VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 681 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[680] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= amp_sv[0];
       jamp_sv[28] += amp_sv[0];
@@ -14406,8 +16228,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 683
       VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 683 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[682] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += amp_sv[0];
       jamp_sv[28] -= amp_sv[0];
@@ -14434,8 +16259,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 684
       VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 684 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[683] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += amp_sv[0];
       jamp_sv[28] -= amp_sv[0];
@@ -14520,8 +16348,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 686
       VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 686 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[685] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] += amp_sv[0];
       jamp_sv[29] -= amp_sv[0];
@@ -14548,8 +16379,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 687
       VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 687 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[686] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] += amp_sv[0];
       jamp_sv[29] -= amp_sv[0];
@@ -14870,8 +16704,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 693
       VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 693 ) numerators_sv += cxabs2( amp_sv[0] );
-      if(
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[692] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -14898,8 +16735,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 694 VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 694 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[693] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -14926,8 +16766,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 695 VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 695 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[694] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; @@ -14954,8 +16797,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 696 VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 696 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[695] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -14974,8 +16820,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 697 FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 697 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[696] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[61] += amp_sv[0]; jamp_sv[62] -= amp_sv[0]; @@ -14990,8 +16839,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 698 FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 698 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[697] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15004,8 +16856,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 699 FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 699 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[698] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15018,8 +16873,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 700 FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 700 ) numerators_sv += cxabs2( amp_sv[0] ); - if( 
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[699] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[55] += amp_sv[0]; jamp_sv[56] -= amp_sv[0]; @@ -15034,8 +16892,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 701 VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 701 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[700] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15088,8 +16949,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 703 FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 703 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[702] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] += amp_sv[0]; jamp_sv[55] -= amp_sv[0]; @@ -15104,8 +16968,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 704 FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 704 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[703] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15118,8 +16985,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 705 FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 705 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[704] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15132,8 +17002,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 706 VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 706 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[705] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15152,8 +17025,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 707 FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 707 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[706] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[85] += amp_sv[0]; jamp_sv[86] -= amp_sv[0]; @@ -15168,8 +17044,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 708 FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 708 ) numerators_sv 
+= cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[707] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15182,8 +17061,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 709 FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 709 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[708] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15196,8 +17078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 710 FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 710 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[709] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[79] += amp_sv[0]; jamp_sv[80] -= amp_sv[0]; @@ -15212,8 +17097,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 711 VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 711 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[710] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15266,8 +17154,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 713 FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 713 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[712] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] += amp_sv[0]; jamp_sv[79] -= amp_sv[0]; @@ -15282,8 +17173,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 714 FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 714 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[713] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15296,8 +17190,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 715 FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 715 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[714] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15310,8 +17207,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 716 VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 716 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[715] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15330,8 +17230,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 717 FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 717 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[716] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; @@ -15346,8 +17249,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 718 FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 718 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[717] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15360,8 +17266,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 719 FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 719 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[718] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15374,8 +17283,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 720 FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 720 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[719] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] += amp_sv[0]; jamp_sv[76] -= amp_sv[0]; @@ -15390,8 +17302,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 721 VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 721 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[720] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15444,8 +17359,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 723 VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 723 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[722] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15464,8 +17382,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 724 FFV1_0( 
w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 724 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[723] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[28] -= amp_sv[0]; @@ -15480,8 +17401,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 725 FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 725 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[724] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15494,8 +17418,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 726 FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 726 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[725] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15508,8 +17435,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 727 FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 727 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[726] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] += amp_sv[0]; jamp_sv[52] -= amp_sv[0]; @@ -15524,8 +17454,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 728 VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 728 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[727] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15578,8 +17511,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 730 FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 730 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[729] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15598,8 +17534,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 731 FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 731 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[730] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15618,8 +17557,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 732 FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 732 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[731] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15638,8 +17580,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 733 FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 733 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[732] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -15654,8 +17599,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 734 FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 734 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[733] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15674,8 +17622,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 735 FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 735 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[734] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[62] += amp_sv[0]; jamp_sv[63] -= amp_sv[0]; @@ -15816,8 +17767,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 739 FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 739 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[738] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[29] -= amp_sv[0]; @@ -15829,8 +17783,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 740 FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 740 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[739] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] -= amp_sv[0]; @@ -15842,8 +17799,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 741 FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 741 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[740] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] -= amp_sv[0]; @@ -15855,8 +17815,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 742 FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 742 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[741] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] -= amp_sv[0]; @@ -15868,8 +17831,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 743 FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 743 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[742] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[37] -= amp_sv[0]; @@ -15881,8 +17847,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 744 FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 744 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[743] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] -= amp_sv[0]; @@ -15894,8 +17863,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 745 FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 745 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[744] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15908,8 +17880,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 746 FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 746 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[745] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15922,8 +17897,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 747 FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 747 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[746] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; @@ -15938,8 +17916,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 748 FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 748 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[747] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[28] -= amp_sv[0]; @@ -15951,8 +17932,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 749 FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 749 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += 
cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[748] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] -= amp_sv[0]; @@ -15964,8 +17948,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 750 FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 750 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[749] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] -= amp_sv[0]; @@ -15977,8 +17964,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 751 FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 751 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[750] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] -= amp_sv[0]; @@ -15990,8 +17980,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 752 FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 752 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[751] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[31] -= amp_sv[0]; @@ -16003,8 +17996,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 753 FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 753 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[752] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] -= amp_sv[0]; @@ -16016,8 +18012,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 754 FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 754 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[753] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16030,8 +18029,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 755 FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 755 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[754] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16044,8 +18046,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 756 FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 756 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[755] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += 
amp_sv[0]; jamp_sv[28] -= amp_sv[0]; @@ -16060,8 +18065,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 757 FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 757 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[756] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] -= amp_sv[0]; @@ -16073,8 +18081,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 758 FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 758 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[757] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] -= amp_sv[0]; @@ -16086,8 +18097,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 759 FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 759 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[758] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] -= amp_sv[0]; @@ -16099,8 +18113,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 760 FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 760 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[759] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[38] -= amp_sv[0]; @@ -16112,8 +18129,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 761 FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 761 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[760] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] -= amp_sv[0]; @@ -16125,8 +18145,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 762 FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 762 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[761] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] -= amp_sv[0]; @@ -16138,8 +18161,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 763 FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 763 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[762] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16152,8 +18178,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 764 FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId == 764 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[763] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16166,8 +18195,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 765 FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 765 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[764] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -16182,8 +18214,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 766 FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 766 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[765] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16196,8 +18231,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 767 FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 767 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[766] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -16212,8 +18250,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 768 VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 768 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[767] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16232,8 +18273,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 769 FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 769 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[768] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -16248,8 +18292,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 770 VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 770 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[769] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16268,8 +18315,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 771 FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 771 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[770] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16318,8 +18368,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 773 FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 773 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[772] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16332,8 +18385,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 774 FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 774 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[773] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -16348,8 +18404,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 775 VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 775 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[774] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16368,8 +18427,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 776 FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 776 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[775] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -16384,8 +18446,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 777 VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 777 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[776] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16404,8 +18469,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 778 FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 778 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[777] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16454,8 +18522,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 780 FFV1_0( 
w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 780 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[779] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16468,8 +18539,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 781 FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 781 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[780] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -16484,8 +18558,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 782 VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 782 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[781] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16504,8 +18581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 783 FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 783 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[782] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -16520,8 +18600,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 784 VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 784 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[783] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16540,8 +18623,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 785 FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 785 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[784] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16648,8 +18734,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 789 FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 789 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[788] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] -= amp_sv[0]; @@ -16661,8 +18750,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 790 FFV1_0( 
w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 790 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[789] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[70] -= amp_sv[0]; @@ -16674,8 +18766,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 791 FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 791 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[790] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[53] -= amp_sv[0]; @@ -16687,8 +18782,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 792 FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 792 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[791] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] -= amp_sv[0]; @@ -16700,8 +18798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 793 FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 793 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[792] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[67] -= amp_sv[0]; @@ -16713,8 +18814,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 794 FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 794 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[793] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[61] -= amp_sv[0]; @@ -16726,8 +18830,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 795 FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 795 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[794] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16740,8 +18847,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 796 FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 796 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[795] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16754,8 +18864,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 797 FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 797 ) numerators_sv += cxabs2( amp_sv[0] ); - if( 
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[796] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] += amp_sv[0]; jamp_sv[53] -= amp_sv[0]; @@ -16770,8 +18883,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 798 FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 798 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[797] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[88] -= amp_sv[0]; @@ -16783,8 +18899,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 799 FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 799 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[798] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[94] -= amp_sv[0]; @@ -16796,8 +18915,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 800 FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 800 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[799] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[77] -= amp_sv[0]; @@ -16809,8 +18931,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 801 FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 801 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[800] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[75] -= amp_sv[0]; @@ -16822,8 +18947,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 802 FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 802 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[801] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] -= amp_sv[0]; @@ -16835,8 +18963,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 803 FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 803 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[802] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[85] -= amp_sv[0]; @@ -16848,8 +18979,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 804 FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 804 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[803] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * 
amp_sv[0];
@@ -16862,8 +18996,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 805
       FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 805 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[804] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16876,8 +19013,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 806
       FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 806 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[805] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] += amp_sv[0];
       jamp_sv[77] -= amp_sv[0];
@@ -16892,8 +19032,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 807
       FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 807 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[806] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[112] -= amp_sv[0];
@@ -16905,8 +19048,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 808
       FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 808 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[807] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[118] -= amp_sv[0];
@@ -16918,8 +19064,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 809
       FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 809 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[808] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[101] -= amp_sv[0];
@@ -16931,8 +19080,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 810
       FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 810 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[809] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[99] -= amp_sv[0];
@@ -16944,8 +19096,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 811
       FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 811 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[810] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[115] -= amp_sv[0];
@@ -16957,8 +19112,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 812
       FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 812 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[811] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[109] -= amp_sv[0];
@@ -16970,8 +19128,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 813
       FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 813 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[812] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16984,8 +19145,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 814
       FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 814 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[813] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16998,8 +19162,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 815
       FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 815 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[814] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[99] += amp_sv[0];
       jamp_sv[101] -= amp_sv[0];
@@ -17014,8 +19181,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 816
       FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 816 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[815] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17028,8 +19198,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 817
       FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 817 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[816] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[64] += amp_sv[0];
       jamp_sv[88] -= amp_sv[0];
@@ -17044,8 +19217,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 818
       VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 818 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[817] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17064,8 +19240,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 819
       FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 819 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[818] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -17080,8 +19259,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 820
       VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 820 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[819] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17100,8 +19282,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 821
       FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 821 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[820] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17148,8 +19333,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 823
       FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 823 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[822] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17162,8 +19350,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 824
       FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 824 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[823] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[70] += amp_sv[0];
       jamp_sv[88] -= amp_sv[0];
@@ -17178,8 +19369,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 825
       VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 825 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[824] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17198,8 +19392,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 826
       FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 826 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[825] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += amp_sv[0];
       jamp_sv[21] -= amp_sv[0];
@@ -17214,8 +19411,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 827
       VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 827 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[826] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17234,8 +19434,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 828
       FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 828 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[827] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17282,8 +19485,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 830
       FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 830 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[829] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17296,8 +19502,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 831
       FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 831 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[830] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[64] += amp_sv[0];
       jamp_sv[70] -= amp_sv[0];
@@ -17312,8 +19521,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 832
       VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 832 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[831] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17332,8 +19544,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 833
       FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 833 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[832] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] += amp_sv[0];
       jamp_sv[23] -= amp_sv[0];
@@ -17348,8 +19563,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 834
       VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 834 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[833] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17368,8 +19586,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 835
       FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 835 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[834] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17472,8 +19693,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 839
       VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 839 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[838] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= amp_sv[0];
       jamp_sv[7] += amp_sv[0];
@@ -17500,8 +19724,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 840
       VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 840 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[839] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= amp_sv[0];
       jamp_sv[6] += amp_sv[0];
@@ -17586,8 +19813,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 842
       VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 842 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[841] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -17614,8 +19844,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 843
       VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 843 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[842] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -17700,8 +19933,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 845
       VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 845 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[844] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -17728,8 +19964,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 846
       VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 846 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[845] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -18054,8 +20293,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 852
       VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 852 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[851] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[1] -= amp_sv[0];
@@ -18082,8 +20324,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 853
       VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 853 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[852] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[1] -= amp_sv[0];
@@ -18110,8 +20355,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 854
       VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 854 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[853] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= amp_sv[0];
       jamp_sv[17] += amp_sv[0];
@@ -18138,8 +20386,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 855
       VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 855 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[854] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18158,8 +20409,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 856
       FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 856 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[855] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[90] += amp_sv[0];
       jamp_sv[91] -= amp_sv[0];
@@ -18174,8 +20428,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 857
       FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 857 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[856] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18188,8 +20445,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 858
       FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 858 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[857] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[72] += amp_sv[0];
       jamp_sv[74] -= amp_sv[0];
@@ -18204,8 +20464,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 859
       FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 859 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[858] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18218,8 +20481,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 860
       VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 860 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[859] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18272,8 +20538,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 862
       FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 862 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[861] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[72] += amp_sv[0];
       jamp_sv[74] -= amp_sv[0];
@@ -18288,8 +20557,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 863
       FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 863 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[862] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18302,8 +20574,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 864
       FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 864 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[863] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18316,8 +20591,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 865
       VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 865 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[864] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18336,8 +20614,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 866
       FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 866 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[865] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[114] += amp_sv[0];
       jamp_sv[115] -= amp_sv[0];
@@ -18352,8 +20633,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 867
       FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 867 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[866] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18366,8 +20650,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 868
       FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 868 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[867] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[96] += amp_sv[0];
       jamp_sv[98] -= amp_sv[0];
@@ -18382,8 +20669,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 869
       FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 869 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[868] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18396,8 +20686,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 870
       VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 870 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[869] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18450,8 +20743,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 872
       FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 872 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[871] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[96] += amp_sv[0];
       jamp_sv[98] -= amp_sv[0];
@@ -18466,8 +20762,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 873
       FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 873 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[872] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18480,8 +20779,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 874
       FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 874 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[873] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18494,8 +20796,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 875
       VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 875 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[874] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18514,8 +20819,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 876
       FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 876 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[875] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -18530,8 +20838,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 877
       FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 877 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[876] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18544,8 +20855,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 878
       FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 878 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[877] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] += amp_sv[0];
       jamp_sv[58] -= amp_sv[0];
@@ -18560,8 +20874,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 879
       FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 879 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[878] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18574,8 +20891,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 880
       VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 880 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[879] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18628,8 +20948,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 882
       VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 882 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[881] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18648,8 +20971,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 883
       FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 883 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[882] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -18664,8 +20990,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 884
       FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 884 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[883] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18678,8 +21007,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 885
       FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 885 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[884] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += amp_sv[0];
       jamp_sv[56] -= amp_sv[0];
@@ -18694,8 +21026,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 886
       FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 886 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[885] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18708,8 +21043,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 887
       VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 887 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[886] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18762,8 +21100,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 889
       FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 889 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[888] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18782,8 +21123,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 890
       FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 890 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[889] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18802,8 +21146,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 891
       FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 891 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[890] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18822,8 +21169,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 892
       FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 892 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[891] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18842,8 +21192,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 893
       FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 893 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[892] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] += amp_sv[0];
       jamp_sv[31] -= amp_sv[0];
@@ -18858,8 +21211,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 894
       FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 894 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[893] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[90] += amp_sv[0];
       jamp_sv[91] -= amp_sv[0];
@@ -18874,8 +21230,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 895
       VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 895 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[894] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= amp_sv[0];
       jamp_sv[13] += amp_sv[0];
@@ -18902,8 +21261,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 896
       VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 896 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[895] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= amp_sv[0];
       jamp_sv[12] += amp_sv[0];
@@ -18988,8 +21350,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 898
       VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 898 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[897] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
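
Every hunk in this region applies the same mechanical transformation: the two per-channel scalar updates are replaced by an unconditional per-diagram store. numerators_sv is now an array holding one running sum per diagram (indexed by diagramId - 1, hence numerators_sv[804] for diagram 805), while denominators_sv remains a single running sum over all contributing diagrams. Below is a minimal standalone sketch of the two accumulation schemes, using plain std::complex doubles and hypothetical helper names in place of the generated SIMD vector types (cxtype_sv, fptype_sv); it is an illustration of the pattern, not the generated code itself.

#include <cassert>
#include <complex>
#include <cstddef>
#include <vector>

// Sketch only: contrasts the old one-channel accumulation with the new
// per-diagram accumulation used for channelId arrays. Names and scalar
// types are illustrative assumptions.
using cxtype = std::complex<double>;
static double cxabs2( const cxtype& c ) { return std::norm( c ); } // |c|^2

// Old scheme: keep only the numerator of the one requested channel.
void accumulateOld( unsigned int channelId, const std::vector<cxtype>& amps,
                    double& numerator, double& denominator )
{
  for( std::size_t idiag = 0; idiag < amps.size(); idiag++ )
  {
    if( channelId == idiag + 1 ) numerator += cxabs2( amps[idiag] ); // diagram numbers are 1-based
    if( channelId != 0 ) denominator += cxabs2( amps[idiag] );
  }
}

// New scheme: keep one numerator per diagram, so the same event batch can
// later be reweighted for any channel choice.
void accumulateNew( bool storeChannelWeights, const std::vector<cxtype>& amps,
                    std::vector<double>& numerators, double& denominator )
{
  if( !storeChannelWeights ) return;
  assert( numerators.size() == amps.size() ); // one slot per diagram
  for( std::size_t idiag = 0; idiag < amps.size(); idiag++ )
  {
    numerators[idiag] += cxabs2( amps[idiag] ); // index idiag == diagramId - 1
    denominator += cxabs2( amps[idiag] );
  }
}

With the per-diagram sums stored, the single-diagram enhancement weight for any channel can be formed after the diagram loop, in effect as numerators[channelId - 1] / denominator, instead of being fixed to one channelId while the amplitudes are being accumulated.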
@@ -19016,8 +21381,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 899
       VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 899 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[898] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -19102,8 +21470,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 901
       VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 901 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[900] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -19130,8 +21501,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 902
       VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 902 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[901] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -19454,8 +21828,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 908
       VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 908 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[907] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
@@ -19482,8 +21859,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 909
       VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 909 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[908] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
@@ -19510,8 +21890,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 910
       VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 910 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[909] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= amp_sv[0];
       jamp_sv[11] += amp_sv[0];
@@ -19538,8 +21921,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 911
       VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 911 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[910] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19558,8 +21944,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 912
       FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 912 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[911] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[66] += amp_sv[0];
       jamp_sv[67] -= amp_sv[0];
@@ -19574,8 +21963,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 913
       FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 913 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[912] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19588,8 +21980,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 914
       FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 914 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[913] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[48] += amp_sv[0];
       jamp_sv[50] -= amp_sv[0];
@@ -19604,8 +21999,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 915
       FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 915 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[914] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19618,8 +22016,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 916
       VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 916 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[915] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
@@ -19672,8 +22073,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 918
       FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 918 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[917] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[48] += amp_sv[0];
       jamp_sv[50] -= amp_sv[0];
@@ -19688,8 +22092,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 919
       FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 919 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[918] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19702,8 +22109,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 920
       FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 920 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[919] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19716,8 +22126,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 921
       VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 921 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[920] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19736,8 +22149,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 922
       FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 922 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[921] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[108] += amp_sv[0];
       jamp_sv[109] -= amp_sv[0];
@@ -19752,8 +22168,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 923
       FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 923 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[922] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19766,8 +22185,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 924
       FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 924 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[923] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[97] += amp_sv[0];
       jamp_sv[98] -= amp_sv[0];
@@ -19782,8 +22204,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 925
       FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 925 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[924] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19796,8 +22221,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 926
       VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 926 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[925] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
@@ -19850,8 +22278,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 928
       FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 928 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[927] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[97] += amp_sv[0];
       jamp_sv[100] -= amp_sv[0];
@@ -19866,8 +22297,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 929
       FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 929 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[928] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19880,8 +22314,11 @@ namespace mg5amcCpu
      // Amplitude(s) for diagram number 930
       FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 930 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[929] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19894,8 +22331,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 931
       VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 931 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[930] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19914,8 +22354,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 932
       FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 932 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[931] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -19930,8 +22373,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 933
       FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 933 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[932] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19944,8 +22390,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 934
       FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 934 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[933] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[40] += amp_sv[0];
       jamp_sv[82] -= amp_sv[0];
@@ -19960,8 +22409,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 935
       FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 935 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[934] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19974,8 +22426,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 936
       VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 936 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[935] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20028,8 +22483,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 938
       VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 938 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[937] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20048,8 +22506,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 939
       FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 939 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[938] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[12] -= amp_sv[0];
@@ -20064,8 +22525,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 940
       FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 940 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[939] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20078,8 +22542,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 941
       FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 941 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[940] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] += amp_sv[0];
       jamp_sv[56] -= amp_sv[0];
@@ -20094,8 +22561,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 942
       FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 942 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[941] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20108,8 +22578,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 943
       VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 943 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[942] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20162,8 +22635,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 945
       FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 945 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[944] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20182,8 +22658,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 946
       FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 946 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[945] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20202,8 +22681,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 947
       FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 947 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[946] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20222,8 +22704,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 948
       FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 948 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[947] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20242,8 +22727,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 949
       FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 949 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[948] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] += amp_sv[0];
       jamp_sv[37] -= amp_sv[0];
@@ -20258,8 +22746,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 950
       FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 950 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[949] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[66] += amp_sv[0];
       jamp_sv[67] -= amp_sv[0];
@@ -20274,8 +22765,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 951
       VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 951 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[950] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= amp_sv[0];
       jamp_sv[19] += amp_sv[0];
@@ -20302,8 +22796,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 952
       VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 952 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[951] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= amp_sv[0];
       jamp_sv[18] += amp_sv[0];
@@ -20388,8 +22885,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 954
       VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 954 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[953] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -20416,8 +22916,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 955
       VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 955 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[954] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -20502,8 +23005,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 957
       VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 957 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[956] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -20530,8 +23036,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 958
       VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 958 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[957] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -20880,8 +23392,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 965 VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 965 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[964] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -20908,8 +23423,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 966 VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 966 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[965] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= amp_sv[0]; jamp_sv[9] += amp_sv[0]; @@ -20936,8 +23454,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 967 VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 967 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[966] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -20956,8 +23477,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 968 FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 968 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[967] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; @@ -20972,8 +23496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 969 FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 969 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[968] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -20986,8 +23513,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 970 FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 970 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[969] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += amp_sv[0]; jamp_sv[50] -= amp_sv[0]; @@ -21002,8 +23532,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 971 FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 971 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[970] += cxabs2( amp_sv[0] ); + denominators_sv += 
cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21016,8 +23549,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 972 VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 972 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[971] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21070,8 +23606,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 974 FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 974 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[973] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += amp_sv[0]; jamp_sv[52] -= amp_sv[0]; @@ -21086,8 +23625,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 975 FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 975 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[974] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21100,8 +23642,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 976 FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 976 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[975] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21114,8 +23659,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 977 VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 977 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[976] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21134,8 +23682,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 978 FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 978 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[977] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += amp_sv[0]; jamp_sv[85] -= amp_sv[0]; @@ -21150,8 +23701,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 979 FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 979 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { 
+ numerators_sv[978] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21164,8 +23718,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 980 FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 980 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[979] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += amp_sv[0]; jamp_sv[74] -= amp_sv[0]; @@ -21180,8 +23737,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 981 FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 981 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[980] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21194,8 +23754,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 982 VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 982 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[981] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21248,8 +23811,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 984 FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 984 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[983] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += amp_sv[0]; jamp_sv[76] -= amp_sv[0]; @@ -21264,8 +23830,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 985 FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 985 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[984] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21278,8 +23847,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 986 FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 986 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[985] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21292,8 +23864,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 987 VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 987 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[986] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21312,8 +23887,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 988 FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 988 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[987] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -21328,8 +23906,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 989 FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 989 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[988] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21342,8 +23923,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 990 FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 990 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[989] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] += amp_sv[0]; jamp_sv[82] -= amp_sv[0]; @@ -21358,8 +23942,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 991 FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 991 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[990] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21372,8 +23959,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 992 VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 992 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[991] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21426,8 +24016,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 994 VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 994 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[993] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21446,8 +24039,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 995 FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 995 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[994] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[18] -= amp_sv[0]; @@ -21462,8 +24058,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 996 FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 996 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[995] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21476,8 +24075,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 997 FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 997 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[996] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] += amp_sv[0]; jamp_sv[58] -= amp_sv[0]; @@ -21492,8 +24094,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 998 FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 998 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[997] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21506,8 +24111,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 999 VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 999 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[998] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21560,8 +24168,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1001 FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1001 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1000] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21580,8 +24191,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1002 FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1002 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1001] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21600,8 +24214,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1003 FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &_fp[0] 
); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1003 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1002] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21620,8 +24237,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1004 FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1004 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1003] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21640,8 +24260,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1005 FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1005 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1004] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -21656,8 +24279,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1006 FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1006 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1005] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; @@ -21672,8 +24298,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1007 VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1007 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1006] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -21700,8 +24329,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1008 VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1008 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1007] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -21786,8 +24418,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1010 VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1010 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1009] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -21814,8 +24449,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1011 VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1011 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1010] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -21900,8 +24538,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1013 VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1013 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1012] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -21928,8 +24569,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1014 VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1014 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1013] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -22192,8 +24836,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1019 VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1019 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1018] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -22220,8 +24867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1020 VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1020 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1019] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -22306,8 +24956,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1022 VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1022 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1021] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[21] -= amp_sv[0]; @@ -22334,8 +24987,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1023 VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1023 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1022] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -22420,8 +25076,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1025 VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1025 ) numerators_sv += cxabs2( amp_sv[0] 
); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1024] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -22448,8 +25107,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1026 VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1026 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1025] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -22710,8 +25372,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1031 VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1031 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1030] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -22738,8 +25403,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1032 VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1032 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1031] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -22824,8 +25492,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1034 VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1034 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1033] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += amp_sv[0]; jamp_sv[23] -= amp_sv[0]; @@ -22852,8 +25523,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1035 VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1035 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1034] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -22938,8 +25612,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1037 VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1037 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1036] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -22966,8 +25643,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1038 VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1038 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) 
+ { + numerators_sv[1037] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -23504,8 +26184,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1046 FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1046 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1045] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] -= amp_sv[0]; @@ -23517,8 +26200,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1047 FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1047 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1046] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] -= amp_sv[0]; @@ -23530,8 +26216,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1048 FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1048 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1047] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[58] -= amp_sv[0]; @@ -23543,8 +26232,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1049 FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1049 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1048] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[68] -= amp_sv[0]; @@ -23556,8 +26248,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1050 FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1050 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1049] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[55] -= amp_sv[0]; @@ -23569,8 +26264,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1051 FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1051 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1050] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] -= amp_sv[0]; @@ -23582,8 +26280,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1052 FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1052 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1051] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] -= amp_sv[0]; @@ -23595,8 +26296,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1053 FFV1_0( w_fp[40], 
w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1053 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1052] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] -= amp_sv[0]; @@ -23608,8 +26312,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1054 FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1054 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1053] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[56] -= amp_sv[0]; @@ -23621,8 +26328,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1055 FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1055 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1054] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[62] -= amp_sv[0]; @@ -23634,8 +26344,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1056 FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1056 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1055] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] -= amp_sv[0]; @@ -23647,8 +26360,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1057 FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1057 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1056] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] -= amp_sv[0]; @@ -23660,8 +26376,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1058 FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1058 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1057] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[49] -= amp_sv[0]; @@ -23676,8 +26395,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1059 FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1059 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1058] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23690,8 +26412,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1060 FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1060 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1059] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] += amp_sv[0]; jamp_sv[55] -= amp_sv[0]; @@ -23706,8 +26431,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1061 VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1061 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1060] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; @@ -23726,8 +26454,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1062 FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1062 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1061] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23740,8 +26471,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1063 VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1063 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1062] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; @@ -23794,8 +26528,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1065 FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1065 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1064] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] -= amp_sv[0]; @@ -23807,8 +26544,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1066 FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1066 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1065] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] -= amp_sv[0]; @@ -23820,8 +26560,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1067 FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1067 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1066] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[82] -= amp_sv[0]; @@ -23833,8 +26576,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1068 FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1068 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( 
storeChannelWeights ) + { + numerators_sv[1067] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[92] -= amp_sv[0]; @@ -23846,8 +26592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1069 FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1069 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1068] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[79] -= amp_sv[0]; @@ -23859,8 +26608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1070 FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1070 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1069] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] -= amp_sv[0]; @@ -23872,8 +26624,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1071 FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1071 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1070] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] -= amp_sv[0]; @@ -23885,8 +26640,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1072 FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1072 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1071] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] -= amp_sv[0]; @@ -23898,8 +26656,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1073 FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1073 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1072] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[80] -= amp_sv[0]; @@ -23911,8 +26672,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1074 FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1074 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1073] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[86] -= amp_sv[0]; @@ -23924,8 +26688,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1075 FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1075 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1074] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] -= amp_sv[0]; @@ -23937,8 +26704,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1076 FFV1_0( w_fp[60], w_fp[43], 
w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1076 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1075] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] -= amp_sv[0]; @@ -23950,8 +26720,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1077 FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1077 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1076] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[73] -= amp_sv[0]; @@ -23966,8 +26739,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1078 FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1078 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1077] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23980,8 +26756,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1079 FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1079 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1078] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] += amp_sv[0]; jamp_sv[79] -= amp_sv[0]; @@ -23996,8 +26775,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1080 VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1080 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1079] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24016,8 +26798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1081 FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1081 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1080] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24030,8 +26815,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1082 VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1082 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1081] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24084,8 +26872,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1084 
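// With one numerator slot per diagram and a denominator summed over all
// diagrams, the per-channel multichannel weight is presumably the usual
// single-diagram-enhancement ratio |A_c|^2 / sum_d |A_d|^2, i.e. one numerator
// slot divided by the shared denominator accumulated in the same pass. A
// hedged sketch of that ratio (the function name and scalar types are
// illustrative assumptions, not the generated code's API):
#include <vector>
double channelWeight( const std::vector<double>& numerators, // |A_d|^2 accumulated per diagram
                      double denominator,                    // sum of |A_d|^2 over all diagrams
                      unsigned int channelId )               // 1-based diagram/channel id, 0 = none
{
  if( channelId == 0 || denominator == 0. ) return 0.; // no channel selected, or nothing accumulated
  return numerators[channelId - 1] / denominator;
}
// For instance, an event generated on channel 1084 (the diagram below) would
// use numerators[1083] over the shared denominator.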
FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1084 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1083] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[100] -= amp_sv[0]; @@ -24097,8 +26888,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1085 FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1085 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1084] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] -= amp_sv[0]; @@ -24110,8 +26904,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1086 FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1086 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1085] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[106] -= amp_sv[0]; @@ -24123,8 +26920,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1087 FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1087 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1086] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[116] -= amp_sv[0]; @@ -24136,8 +26936,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1088 FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1088 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1087] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[103] -= amp_sv[0]; @@ -24149,8 +26952,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1089 FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1089 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1088] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] -= amp_sv[0]; @@ -24162,8 +26968,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1090 FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1090 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1089] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] -= amp_sv[0]; @@ -24175,8 +26984,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1091 FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1091 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + 
numerators_sv[1090] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] -= amp_sv[0]; @@ -24188,8 +27000,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1092 FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1092 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1091] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[104] -= amp_sv[0]; @@ -24201,8 +27016,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1093 FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1093 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1092] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[110] -= amp_sv[0]; @@ -24214,8 +27032,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1094 FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1094 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1093] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] -= amp_sv[0]; @@ -24227,8 +27048,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1095 FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1095 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1094] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] -= amp_sv[0]; @@ -24240,8 +27064,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1096 FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1096 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1095] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= amp_sv[0]; @@ -24256,8 +27083,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1097 FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1097 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1096] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24270,8 +27100,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1098 FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1098 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1097] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; @@ -24286,8 +27119,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 1099 VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1099 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1098] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24306,8 +27142,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1100 FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1100 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1099] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24320,8 +27159,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1101 VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1101 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1100] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24374,8 +27216,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1103 FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1103 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1102] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += amp_sv[0]; jamp_sv[46] -= amp_sv[0]; @@ -24390,8 +27235,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1104 FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1104 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1103] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24404,8 +27252,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1105 FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1105 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1104] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -24420,8 +27271,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1106 VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1106 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1105] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] -= cxtype( 0, 
1 ) * amp_sv[0];
       jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
@@ -24440,8 +27294,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1107
       FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1107 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1106] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -24454,8 +27311,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1108
       VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1108 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1107] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -24508,8 +27368,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1110
       FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1110 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1109] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] += amp_sv[0];
       jamp_sv[44] -= amp_sv[0];
@@ -24524,8 +27387,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1111
       FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1111 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1110] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -24538,8 +27404,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1112
       FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1112 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1111] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -24554,8 +27423,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1113
       VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1113 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1112] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
@@ -24574,8 +27446,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1114
       FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1114 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1113] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
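// The unchanged context lines around each hunk scatter the amplitude into the
// color-flow array jamp_sv with coefficients of +/-1 or +/-i: cxtype( 0, 1 )
// is the imaginary unit, so "jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]" adds
// i * amp to one color flow while a partner flow is decremented. A scalar
// sketch of that bookkeeping (array size and indices are illustrative only):
#include <complex>
void addToColorFlows( std::complex<double>* jamp, const std::complex<double>& amp )
{
  const std::complex<double> I( 0., 1. ); // plays the role of cxtype( 0, 1 )
  jamp[66] += I * amp;  // this color flow gains i * amp
  jamp[108] -= I * amp; // its partner loses i * amp, as for diagram 1114 above
}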
amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24588,8 +27463,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1115 VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1115 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1114] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24642,8 +27520,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1117 FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1117 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1116] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] += amp_sv[0]; jamp_sv[38] -= amp_sv[0]; @@ -24658,8 +27539,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1118 FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1118 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1117] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24672,8 +27556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1119 FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1119 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1118] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -24688,8 +27575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1120 VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1120 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1119] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24708,8 +27598,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1121 FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1121 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1120] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24722,8 +27615,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1122 VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1122 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1121] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -30448,9 +33344,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -30526,8 +33421,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -30589,25 +33483,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return;
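Note: in this patch numerators_sv becomes a per-diagram array (one running |amp|^2 sum per diagram, as in the CPPProcess.cc hunks above), and normalise_output first reduces the per-helicity copies into the "helicity #0" slice before applying the single-diagram-enhancement weight. A minimal standalone sketch of that final weight, assuming the [ievt * ndiagrams + idiag] layout used above (multichannelWeight and its arguments are hypothetical names, not part of the patch):

#include <cassert>
#include <vector>

// weight = |amp(channel)|^2 / sum over diagrams of |amp|^2, per event,
// with numerators already summed over the good helicities
double multichannelWeight( const std::vector<double>& totNumerators,   // [nevt * ndiagrams]
                           const std::vector<double>& totDenominators, // [nevt]
                           int ndiagrams, int ievt, unsigned int channelId )
{
  assert( channelId >= 1 ); // channelIds use Fortran-style 1-based numbering
  return totNumerators[ievt * ndiagrams + ( channelId - 1 )] / totDenominators[ievt];
}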
@@ -30652,16 +33556,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators[nevt * ndiagrams], summed over helicities + const fptype* allDenominators, // input: denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels )
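Note: the channel choice above is an inverse-CDF draw over the per-diagram numerators, skipping channels that have no single-diagram-enhancement config (channel2iconfig == -1); the fallback channelId = nchannels guards against the cumulative sum never crossing the random number because of rounding. A plain C++ mock of the same algorithm (sampleDiagram and its arguments are hypothetical names, not part of the patch):

#include <cstddef>
#include <vector>

unsigned int sampleDiagram( const std::vector<double>& numerators,   // per-diagram |amp|^2 for one event
                            const std::vector<int>& channel2iconfig, // -1 marks channels without an SDE config
                            double rnd )                             // uniform random number in [0,1)
{
  double norm = 0.;
  for( std::size_t ichan = 0; ichan < numerators.size(); ichan++ )
    if( channel2iconfig[ichan] != -1 ) norm += numerators[ichan];
  double cumsum = 0.;
  unsigned int channelId = static_cast<unsigned int>( numerators.size() ); // fallback: last channel
  for( std::size_t ichan = 0; ichan < numerators.size(); ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    cumsum += numerators[ichan];
    if( rnd < cumsum / norm )
    {
      channelId = static_cast<unsigned int>( ichan ) + 1; // 1-based channelId
      break;
    }
  }
  return channelId;
}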
@@ -30729,6 +33661,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -30738,6 +33671,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -30749,8 +33684,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -30778,7 +33715,6 @@ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -30794,7 +33730,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -30808,11 +33744,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -30824,6 +33763,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -30832,9 +33772,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -30845,9 +33786,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel(
select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -30885,40 +33832,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -30933,7 +33846,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -30981,82 +33895,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -31081,13 +34010,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -31095,13 +34018,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 201a432a8a..ee2421cf9a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */
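Note: the sigmaKin declarations above gain the diagram-sampling arguments (allrnddiagram, allDiagramIdsOut, mulChannelWeight), and the new processConfig.h below exports the diagram count as a compile-time constant that sizes every per-diagram buffer (e.g. the cNGoodHel * ndiagrams * nevt memset in sigmaKin). A rough sizing sketch with illustrative numbers only (helNumeratorsBytes is a hypothetical name; nevt and nGoodHel are runtime values):

#include <cstddef>

// one fptype numerator per event, per good helicity, per diagram
std::size_t helNumeratorsBytes( std::size_t nevt, std::size_t nGoodHel, std::size_t ndiagrams, std::size_t fptypeSize )
{
  return nevt * nGoodHel * ndiagrams * fptypeSize;
}
// e.g. for this process (ndiagrams = 1240), nevt = 16384 and a worst case of 128 good
// helicities in double precision: 16384 * 128 * 1240 * 8 B, i.e. roughly 20 GB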
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/processConfig.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/processConfig.h new file mode 100644 index 0000000000..fe7af482a7 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 1240; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool
is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118 + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + }
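Note: umami_get_meta (implemented above) writes an int for the count keys and an UmamiDevice for UMAMI_META_DEVICE, so callers can size their buffers before any instance exists. A minimal, hypothetical usage sketch (printCounts is not part of the patch):

#include "umami.h"
#include <cstdio>

void printCounts()
{
  int ndiag = 0, npar = 0;
  if( umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiag ) == UMAMI_SUCCESS &&
      umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ) == UMAMI_SUCCESS )
    std::printf( "%d diagrams, %d external particles\n", ndiag, npar ); // 1240 and 7 for gg>ttxggg
}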
+ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; // ceil( count / n_threads ) + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+ + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL + // round the event count up to twice the SIMD page size (mixed-precision builds process two neppV pages at once) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118 + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +}
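Note: end of the interface implementation; a minimal, hypothetical caller for the CPU build of umami_matrix_element, using only the umami.h API shown below (run, param_card and me2 are illustrative names; stride = nevents and offset = 0 process the whole batch in one call):

#include "umami.h"
#include <cstddef>
#include <vector>

int run( const char* param_card, const double* momenta, std::size_t nevents )
{
  // momenta layout per transpose_momenta above: momenta[( i_mom * npar + i_part ) * stride + i_event]
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, param_card ) != UMAMI_SUCCESS ) return 1;
  std::vector<double> me2( nevents );
  UmamiInputKey inKeys[1] = { UMAMI_IN_MOMENTA };
  const void* ins[1] = { momenta };
  UmamiOutputKey outKeys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outs[1] = { me2.data() };
  UmamiStatus st = umami_matrix_element( handle, nevents, nevents, 0, 1, inKeys, ins, 1, outKeys, outs );
  umami_free( handle );
  return st == UMAMI_SUCCESS ? 0 : 1;
}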
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. + */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata of the matrix element code, such as the target device + * and the particle, diagram and helicity counts. May be called before any + * instance is created. + * + * @param meta_key + * key selecting the metadata entry to query + * @param result + * pointer to the result; its type depends on the key (int for the count + * keys, UmamiDevice for UMAMI_META_DEVICE) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees a matrix element instance + * + * @param handle + * handle of a matrix element instance + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER
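Note: the count/stride/offset convention documented above implies a feature-major layout, where each feature (a momentum component of one particle, a diagram's |amp|^2, ...) occupies a contiguous run of `stride` events, and `offset` shifts the event index so sub-batches of a larger buffer can be evaluated call by call. A small worked index helper, mirroring amp2_out[stride * i_diag + i_event + offset] from umami.cc above (amp2Index is a hypothetical name):

#include <cstddef>

std::size_t amp2Index( std::size_t stride, std::size_t offset, std::size_t i_diag, std::size_t i_event )
{
  return stride * i_diag + i_event + offset;
}
// e.g. stride = 10, offset = 2: diagram 3 of event 1 lands at index 10 * 3 + 1 + 2 = 33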
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 4f7b5172f1..1240605a6d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004235267639160156  +DEBUG: model prefixing takes 0.003571033477783203  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.490 s +1 processes with 1240 diagrams generated in 0.709 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 5.122 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 2.512 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.290 s +ALOHA: aloha creates 5 routines in 0.157 s VVV1 VVV1 FFV1 @@ -189,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.290 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m10.012s -user 0m9.867s -sys 0m0.109s -Code generation completed in 10 seconds +real 0m7.550s +user 0m5.121s +sys 0m0.217s +Code generation completed in 8 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
- sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+ sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
 assert( useChannelIds == false );
 sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
 }
 
+ // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+ // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+ static __host__ __device__ inline fptype_sv*
+ kernelAccessP( fptype* buffer )
+ {
+ return reinterpret_cast<fptype_sv*>( buffer );
+ }
+
 // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
 // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
 static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
 
 #include 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
 typedef BufferBase BufferNumerators;
 
 // The size (number of elements) per event in a memory buffer for numerators
- constexpr size_t sizePerEventNumerators = 1;
+ // (should be equal to the number of diagrams in the process)
+ constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
index 2f17add993..12c45ededb 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -32338,9 +32399,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, 
allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -32416,8 +32476,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -32479,25 +32538,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -32542,16 +32611,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* 
allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -32619,6 +32716,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -32628,6 +32726,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb 
individual helicities @@ -32639,8 +32739,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -32668,7 +32770,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -32684,7 +32785,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -32698,11 +32799,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -32714,6 +32818,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -32722,9 +32827,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, 
allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -32735,9 +32841,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -32775,40 +32887,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -32823,7 +32901,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -32871,82 +32950,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -32971,13 +33065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -32985,13 +33073,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 201a432a8a..ee2421cf9a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
 int* allselcol, // output: helicity selection[nevt]
 fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
 fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/processConfig.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/processConfig.h
new file mode 100644
index 0000000000..fe7af482a7
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/processConfig.h
@@ -0,0 +1,20 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
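+
+// Compile-time process constants shared between the process kernels and the
+// framework code: currently only the number of Feynman diagrams, used e.g. to
+// size the per-diagram numerator buffers (sizePerEventNumerators in MemoryBuffers.h).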
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 1240; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); 
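+ // NB: the void* return value is unused; it only allows this function to initialize
+ // the static local variable in initialize() below, which guarantees once-only,
+ // thread-safe execution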
+ return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // fallback g_s = sqrt( 4 * pi * alpha_s ) for alpha_s = 0.118
+ }
+
+ __global__ void copy_outputs(
+ fptype* denominators,
+ fptype* numerators,
+ fptype* matrix_elements,
+ unsigned int* diagram_index,
+ int* color_index,
+ int* helicity_index,
+ double* m2_out,
+ double* amp2_out,
+ int* diagram_out,
+ int* color_out,
+ int* helicity_out,
+ std::size_t count,
+ std::size_t stride,
+ std::size_t offset )
+ {
+ std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+ if( i_event >= count ) return;
+
+ if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+ if( amp2_out )
+ {
+ double denominator = denominators[i_event];
+ for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+ {
+ amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+ }
+ }
+ if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+ if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+ if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+ }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+ struct InterfaceInstance
+ {
+ bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+ gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+ };
+
+}
+
+extern "C"
+{
+ UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+ {
+ switch( meta_key )
+ {
+ case UMAMI_META_DEVICE:
+ {
+ UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+ device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+ device = UMAMI_DEVICE_HIP;
+#endif
+#else
+ device = UMAMI_DEVICE_CPU;
+#endif
+ break;
+ }
+ case UMAMI_META_PARTICLE_COUNT:
+ *static_cast<int*>( result ) = CPPProcess::npar;
+ break;
+ case UMAMI_META_DIAGRAM_COUNT:
+ *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+ break;
+ case UMAMI_META_HELICITY_COUNT:
+ *static_cast<int*>( result ) = CPPProcess::ncomb;
+ break;
+ case UMAMI_META_COLOR_COUNT:
+ return UMAMI_ERROR_UNSUPPORTED_META;
+ default:
+ return UMAMI_ERROR_UNSUPPORTED_META;
+ }
+ return UMAMI_SUCCESS;
+ }
+
+ UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+ {
+ CPPProcess process;
+ process.initProc( param_card_path );
+ auto instance = new InterfaceInstance();
+ *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+ for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+ {
+ gpuStreamCreate( &instance->hel_streams[ihel] );
+ }
+#endif
+ return UMAMI_SUCCESS;
+ }
+
+ UmamiStatus umami_set_parameter(
+ UmamiHandle handle,
+ char const* name,
+ double parameter_real,
+ double parameter_imag )
+ {
+ return UMAMI_ERROR_NOT_IMPLEMENTED;
+ }
+
+ UmamiStatus umami_get_parameter(
+ UmamiHandle handle,
+ char const* name,
+ double* parameter_real,
+ double* parameter_imag )
+ {
+ return UMAMI_ERROR_NOT_IMPLEMENTED;
+ }
+
+ UmamiStatus umami_matrix_element(
+ UmamiHandle handle,
+ size_t count,
+ size_t stride,
+ size_t offset,
+ size_t input_count,
+ UmamiInputKey const* input_keys,
+ void const* const* inputs,
+ size_t output_count,
+ UmamiOutputKey const* output_keys,
+ void* const* outputs )
+ {
+ const double* momenta_in = nullptr;
+ const double* alpha_s_in = nullptr;
+ const int* flavor_in = nullptr; // TODO: unused
+ const double* random_color_in = nullptr;
+ const double* random_helicity_in = nullptr;
+ const double* random_diagram_in = nullptr;
+ const int* diagram_in = nullptr; // TODO: unused
+
+ for( std::size_t i = 0; i < input_count; ++i )
+ {
+ const void* input = inputs[i];
+ switch( input_keys[i] )
+ {
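+ // map each input key to its typed pointer; flavor and diagram indices are
+ // accepted but not used yet (see the TODOs above), helicity index input is
+ // not supported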
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to twice the page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata of the matrix element implementation, such as the device + * it runs on or the number of particles, diagrams and helicity combinations of the + * process. + * + * @param meta_key + * key identifying the metadata entry to query + * @param result + * pointer to the memory the value is written to; its type depends on the key + * (UmamiDevice for UMAMI_META_DEVICE, int for the counts) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param handle + * on return, points to the new instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @param param_card_path + * path to the parameter file + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
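+   *
+   * Memory layout: scalar per-event quantities are addressed as array[i_event + offset]
+   * for i_event in [0, count); multi-component quantities such as UMAMI_OUT_DIAGRAM_AMP2
+   * store component i at array[i * stride + i_event + offset] (see the copy_outputs
+   * kernel and the host event loop in the implementation).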
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees a matrix element instance + * + * @param handle + * handle of a matrix element instance + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 71b7095c67..f2e018386d 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004422187805175781  +DEBUG: model prefixing takes 0.0024957656860351562  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,21 +166,21 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
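For orientation, the following is a minimal sketch of a caller driving the UMAMI C API declared in umami.h above. It is illustrative only and not part of the patch: the param card path and batch size are placeholders, the momenta are left zeroed and would have to be filled with real phase-space points following the stride/offset layout, and error handling is reduced to early returns.

/* umami_demo.c -- illustrative sketch, not part of this patch */
#include <stdio.h>
#include <stdlib.h>
#include "umami.h"

int main( void )
{
  int npar = 0, ndiag = 0; /* counts are written as plain ints by the implementation above */
  if( umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ) != UMAMI_SUCCESS ) return 1;
  if( umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiag ) != UMAMI_SUCCESS ) return 1;

  UmamiHandle handle;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; /* placeholder path */

  /* a contiguous batch: stride == count, offset == 0 */
  const size_t count = 8, stride = 8, offset = 0;
  double* momenta = (double*)calloc( count * (size_t)npar * 4, sizeof( double ) ); /* fill with real phase-space points */
  double* me = (double*)calloc( count, sizeof( double ) );
  double* amp2 = (double*)calloc( count * (size_t)ndiag, sizeof( double ) ); /* diagram i at amp2[i * stride + ievt] */

  UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA };
  const void* const inputs[] = { momenta };
  UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT, UMAMI_OUT_DIAGRAM_AMP2 };
  void* const outputs[] = { me, amp2 };

  UmamiStatus status = umami_matrix_element( handle, count, stride, offset,
                                             1, in_keys, inputs, 2, out_keys, outputs );
  if( status == UMAMI_SUCCESS )
    for( size_t ievt = 0; ievt < count; ++ievt )
      printf( "ME[%zu] = %g\n", ievt, me[ievt] );

  free( momenta ); free( me ); free( amp2 );
  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}

Omitting the UMAMI_IN_RANDOM_* inputs is allowed; as seen in the host event loop above, the implementation then falls back to fixed defaults (0.5) for the random numbers.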
-8 processes with 40 diagrams generated in 0.058 s +8 processes with 40 diagrams generated in 0.069 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -199,9 +200,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -210,50 +211,52 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.026 s -Wrote files for 32 helas calls in 0.131 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.049 s +Wrote files for 32 helas calls in 3.302 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.106 s +ALOHA: aloha creates 2 routines in 0.104 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.094 s +ALOHA: aloha creates 4 routines in 0.087 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. 
+DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m2.314s -user 0m1.828s -sys 0m0.404s -Code generation completed in 2 seconds +real 0m10.462s +user 0m1.475s +sys 0m0.703s +Code generation completed in 10 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -274,9 +277,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -303,9 +306,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 7d4745918b..5091f85849 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + 
bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +458,11 @@ // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -426,8 +493,11 @@ // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -440,8 +510,11 @@ // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -454,8 +527,11 @@ // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -787,9 +863,8 @@ gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -865,8 +940,7 @@ cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -928,25 +1002,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -991,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice 
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1068,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1077,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1088,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1117,7 +1234,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = 
HostAccessNumerators; // non-trivial access: buffer includes all events
 using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
- using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1133,7 +1249,7 @@ namespace mg5amcCpu
 gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
- gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
 gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
 gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1147,11 +1263,14 @@ namespace mg5amcCpu
 fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
 MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
 fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
- numerators_sv = fptype_sv{ 0 };
+ for( int i = 0; i < processConfig::ndiagrams; ++i )
+ {
+ numerators_sv[i] = fptype_sv{ 0 };
+ }
 denominators_sv = fptype_sv{ 0 };
 #endif
 }
@@ -1163,6 +1282,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
 // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
 // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1171,9 +1291,10 @@ namespace mg5amcCpu
 const int ihel = cGoodHel[ighel];
 fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+ fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
 gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1184,9 +1305,15 @@ namespace mg5amcCpu
 // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
 // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
 gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Event-by-event random choice of color #402
- gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+ // Event-by-event random choice of color and diagram #402
+ gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+ gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
 // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1224,40 +1351,6 @@ namespace mg5amcCpu
 const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
 const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
- // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
- // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
- if( allChannelIds != nullptr )
- {
- // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
- const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
- uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
- // NB: channelIds_sv is a scalar in no-SIMD C++
- channelId = channelIds_sv;
-#else
- // NB: channelIds_sv is a vector in SIMD C++
- channelId = channelIds_sv[0]; // element[0]
- for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
- {
- assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
- }
-#endif
- assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
- const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
- uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
- // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- for( int i = 0; i < neppV; ++i )
- {
- assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
- }
-#endif
- }
 #endif
 // Running sum of partial amplitudes squared for event by event color selection (#402)
 // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1272,7 +1365,8 @@ namespace mg5amcCpu
 cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
 calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1320,82 +1414,97 @@ namespace mg5amcCpu
 }
 #endif
 }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
- // Event-by-event random choice of color #402
- if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
- {
- if( channelId > mgOnGpu::nchannels )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
- assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
- }
- // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
- // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
- if( iconfig <= 0 )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
- assert( iconfig > 0 ); // SANITY CHECK #917
- }
- else if( iconfig > (int)mgOnGpu::nconfigSDE )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
- assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
- }
- fptype_sv targetamp[ncolor] = { 0 };
- // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
- for( int icolC = 0; icolC < ncolor; icolC++ )
- {
- if( icolC == 0 )
- targetamp[icolC] = fptype_sv{ 0 };
- else
- targetamp[icolC] = targetamp[icolC - 1];
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
- }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv targetamp2[ncolor] = { 0 };
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ const int vecsize = 2 * neppV;
+#else
+ const int vecsize = neppV;
+#endif
+ unsigned int channelIdVec[vecsize];
+ if( allChannelIds != nullptr )
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
 {
- if( icolC == 0 )
- targetamp2[icolC] = fptype_sv{ 0 };
- else
- targetamp2[icolC] = targetamp2[icolC - 1];
- // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+ const int ievt = ievt00 + ieppV;
+ channelIdVec[ieppV] = allChannelIds[ievt];
 }
-#endif
- for( int ieppV = 0; ieppV < neppV; ++ieppV )
+ }
+
+ // Event-by-event random choice of channel
+ if( allrnddiagram != nullptr )
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
 {
 const int ievt = ievt00 + ieppV;
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ fptype numerator_sum = 0., normalization = 0.;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
 {
-#if defined MGONGPU_CPPSIMD
- // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
- volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
- const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
- if( okcol )
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+ }
+ channelIdVec[ieppV] = mgOnGpu::nchannels;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+ if( allrnddiagram[ievt] < numerator_sum / normalization )
 {
- allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+ channelIdVec[ieppV] = ichan + 1;
 break;
 }
 }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- const int ievt2 = ievt00 + ieppV + neppV;
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+ allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+ }
+ }
+
+ // Event-by-event random choice of color #402
+ if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+ {
+ unsigned int channelId = channelIdVec[ieppV];
+ if( channelId > mgOnGpu::nchannels )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+ assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+ }
+ // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+ // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+ const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+ if( iconfig <= 0 )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+ assert( iconfig > 0 ); // SANITY CHECK #917
+ }
+ else if( iconfig > (int)mgOnGpu::nconfigSDE )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+ assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+ }
+ fptype targetamp[ncolor] = { 0 };
+ // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( icolC == 0 )
+ targetamp[icolC] = 0;
+ else
+ targetamp[icolC] = targetamp[icolC - 1];
+ if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+ }
+ const int ievt = ievt00 + ieppV;
+ //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
 for( int icolC = 0; icolC < ncolor; icolC++ )
 {
- if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+ if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
 {
- allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+ allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+ //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
 break;
 }
 }
-#endif
 }
 }
 else
@@ -1420,13 +1529,7 @@ namespace mg5amcCpu
 // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
 // [NB 'sum over final spins, average over initial spins', eg see
 // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
 for( int ipagV = 0; ipagV < npagV; ++ipagV )
 {
 const int ievt0 = ipagV * neppV;
@@ -1434,13 +1537,14 @@ namespace mg5amcCpu
 fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
 MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
 fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
- MEs_sv *= numerators_sv / denominators_sv;
+ MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
 }
 #endif
 //for( int ieppV = 0; ieppV < neppV; ieppV++ )
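Aside for reviewers (illustration only, not part of the patch): the event-by-event channel choice added above is plain inverse-CDF sampling over the per-diagram numerators, skipping channels with channel2iconfig == -1. A minimal scalar sketch of the same logic, with hypothetical names and none of the SIMD page indexing:

  #include <cassert>
  #include <vector>
  // Pick a 1-based channelId with probability numerators[i] / sum(numerators).
  unsigned int sampleDiagram( const std::vector<double>& numerators, double rnd )
  {
    double total = 0.;
    for( double n : numerators ) total += n; // normalization = sum of per-diagram weights
    assert( total > 0. && rnd >= 0. && rnd < 1. );
    double cumulative = 0.;
    for( unsigned int idiag = 0; idiag < numerators.size(); ++idiag )
    {
      cumulative += numerators[idiag];
      if( rnd < cumulative / total ) return idiag + 1; // 1-based channelId, as in the patch
    }
    return static_cast<unsigned int>( numerators.size() ); // rounding guard, like the nchannels default above
  }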
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
index bd42537623..0bf2e4625f 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
 int* allselcol, // output: helicity selection[nevt]
 fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
 fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/processConfig.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/processConfig.h
new file mode 100644
index 0000000000..fe66e4e760
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXU_H
+#define MG5_CONFIG_SIGMA_SM_GU_TTXU_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 5;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GU_TTXU_H
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
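Aside (illustration only): the new per-process processConfig.h exposes the diagram count as a compile-time constant, which is what lets the numerator buffers grow from one slot per event to one slot per (event, diagram) pair. A sketch of the host-side AOSOA index arithmetic assumed by these hunks, with a hypothetical helper name:

  #include <cstddef>
  namespace processConfig { constexpr int ndiagrams = 5; } // as in the new header
  // Within a SIMD page of neppV events, the ndiagrams weights of each event are
  // stored diagram-major, matching the expression used in the CPU sampling loop:
  // allNumerators[ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV]
  constexpr std::size_t numeratorIndex( std::size_t ievt, std::size_t idiag, std::size_t neppV )
  {
    return ievt / neppV * neppV * processConfig::ndiagrams + idiag * neppV + ievt % neppV;
  }
  static_assert( numeratorIndex( 0, 0, 4 ) == 0, "first event, first diagram" );
  static_assert( numeratorIndex( 1, 2, 4 ) == 9, "page 0: idiag*neppV + ieppV = 2*4 + 1" );
  static_assert( numeratorIndex( 5, 0, 4 ) == 21, "page 1 starts at neppV*ndiagrams = 20" );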
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc
index 414284b61a..9a64abfae3 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -101,6 +102,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+ ,
+ const int ievt00,
+ bool sanityCheckMixedPrecision = true
+#endif
+ )
+ {
+ unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+ using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+ // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+ if( allChannelIds != nullptr )
+ {
+ const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+ const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+ // NB: channelIds_sv is a scalar in CUDA
+ channelId = channelIds_sv;
+ assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+ }
+#else // Cuda or C++
+ using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+ // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+ // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+ if( allChannelIds != nullptr )
+ {
+ // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+ const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+ uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+ // NB: channelIds_sv is a scalar in no-SIMD C++
+ channelId = channelIds_sv;
+#else
+ // NB: channelIds_sv is a vector in SIMD C++
+ channelId = channelIds_sv[0]; // element[0]
+ for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+ {
+ assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+ }
+#endif
+ assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+ if( sanityCheckMixedPrecision )
+ {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+ const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+ uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+ // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+ for( int i = 0; i < neppV; ++i )
+ {
+ assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+ }
+#endif
+ }
+ }
+#endif // MGONGPUCPP_GPUIMPL
+ return channelId;
+ }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
 constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
 constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
 constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
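Aside (illustration only): getChannelId relies on the invariant that all events in one SIMD page carry the same channelId, so a single scalar can represent the page. The core of that check, reduced to plain arrays with hypothetical names:

  #include <cassert>
  // Returns the common channelId of one SIMD page, asserting uniformity as in #898.
  unsigned int pageChannelId( const unsigned int* channelIds, int neppV )
  {
    unsigned int channelId = channelIds[0]; // element[0] stands in for the whole page
    for( int i = 1; i < neppV; ++i )
      assert( channelId == channelIds[i] ); // all events in the page must agree
    assert( channelId > 0 ); // 0 would mean "multichannel disabled"
    return channelId;
  }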
@@ -242,7 +306,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+ bool storeChannelWeights,
 fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
 fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
 fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -251,7 +315,7 @@ namespace mg5amcCpu
 #else
 cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+ bool storeChannelWeights,
 fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
 fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
 fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -347,7 +411,8 @@ namespace mg5amcCpu
 const fptype* COUPs[nxcoup];
 for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = allNumerators;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+ fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
 fptype* denominators = allDenominators;
 #endif
 #else
@@ -360,7 +425,7 @@ namespace mg5amcCpu
 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
 fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -369,12 +434,8 @@ namespace mg5amcCpu
 for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
- // SCALAR channelId for the current event (CUDA)
- unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
 // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -397,8 +458,11 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 1
 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( storeChannelWeights )
+ {
+ numerators_sv[0] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
 #endif
 jamp_sv[1] -= 1. / 6. * amp_sv[0];
 jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -412,8 +476,11 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 2
 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( storeChannelWeights )
+ {
+ numerators_sv[1] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
 #endif
 jamp_sv[2] -= 1. / 6. * amp_sv[0];
 jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -426,8 +493,11 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 3
 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( storeChannelWeights )
+ {
+ numerators_sv[2] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
 #endif
 jamp_sv[0] += 1. / 2. * amp_sv[0];
 jamp_sv[2] -= 1. / 6. * amp_sv[0];
@@ -440,8 +510,11 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 4
 FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( storeChannelWeights )
+ {
+ numerators_sv[3] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
 #endif
 jamp_sv[0] += 1. / 2. * amp_sv[0];
 jamp_sv[1] -= 1. / 6. * amp_sv[0];
@@ -454,8 +527,11 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 5
 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( storeChannelWeights )
+ {
+ numerators_sv[4] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
 #endif
 jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
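Aside (illustration only): with one numerator slot per diagram, the single-diagram enhancement applied in normalise_output reduces to ME' = ME * N_c / D, where N_c is the chosen channel's |amp|^2 summed over good helicities and D is the same sum over all diagrams. A scalar sketch with hypothetical names:

  #include <cassert>
  // Channel weight for the single-diagram enhancement: ME' = ME * N_c / D.
  double applyChannelWeight( double me, const double* numerators, double denominator, unsigned int channelId )
  {
    assert( channelId >= 1 ); // channelId is 1-based, following the Fortran convention
    double weight = numerators[channelId - 1] / denominator; // in [0,1] since D is the sum of all N_d
    return me * weight;
  }

For example, with per-diagram numerators {2, 1, 1} and denominator 4, channelId 1 yields a weight of 0.5.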
@@ -787,9 +863,8 @@ namespace mg5amcCpu
 gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
 // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
- constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
- gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+ constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+ gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
 gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -865,8 +940,7 @@ namespace mg5amcCpu
 cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
- constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
- calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
 calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -928,25 +1002,35 @@ namespace mg5amcCpu
 fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ bool storeChannelWeights, // if true, compute final multichannel weights
+ bool mulChannelWeight, // if true, multiply matrix element by channel weight
 #endif
- const fptype globaldenom ) /* clang-format on */
+ const fptype globaldenom ) /* clang-format on */
 {
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
 allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const int nevt = gridDim.x * blockDim.x;
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
 fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
 fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
 for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
 {
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- totAllNumerators[ievt] += hAllNumerators[ievt];
 totAllDenominators[ievt] += hAllDenominators[ievt];
+ fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+ fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+ for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+ {
+ firstNumerator[idiag] += hAllNumerators[idiag];
+ }
+ }
+ if( mulChannelWeight )
+ {
+ unsigned int channelId = allChannelIds[ievt];
+ allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
 }
- allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
 }
 #endif
 return;
@@ -991,16 +1075,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 __global__ void
- select_col( int* allselcol, // output: color selection[nevt]
- const fptype* allrndcol, // input: random numbers[nevt] for color selection
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
- const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
- const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ select_col_and_diag( int* allselcol, // output: color selection[nevt]
+ unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+ const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+ const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+ const fptype* allNumerators, // input: all numerators
+ const fptype* allDenominators, // input: all denominators
+ const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 {
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
 // SCALAR channelId for the current event (CUDA)
 unsigned int channelId = gpu_channelId( allChannelIds );
 // Event-by-event random choice of color #402
+
+ // Event-by-event random choice of channel
+ if( allrnddiagram != nullptr )
+ {
+ fptype numerator_sum = 0., normalization = 0.;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+ }
+ channelId = mgOnGpu::nchannels;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+ if( allrnddiagram[ievt] < numerator_sum / normalization )
+ {
+ channelId = ichan + 1;
+ break;
+ }
+ }
+ allDiagramIdsOut[ievt] = channelId;
+ }
+
 if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
 {
 if( channelId > mgOnGpu::nchannels )
@@ -1068,6 +1180,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -1077,6 +1190,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1088,8 +1203,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 int* allselcol, // output: helicity selection[nevt]
- fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+ fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1117,7 +1234,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
 using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
- using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1133,7 +1249,7 @@ namespace mg5amcCpu
 gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
- gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
 gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
 gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1147,11 +1263,14 @@ namespace mg5amcCpu
 fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
 MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
 fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
- numerators_sv = fptype_sv{ 0 };
+ for( int i = 0; i < processConfig::ndiagrams; ++i )
+ {
+ numerators_sv[i] = fptype_sv{ 0 };
+ }
 denominators_sv = fptype_sv{ 0 };
 #endif
 }
@@ -1163,6 +1282,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
 // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
 // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1171,9 +1291,10 @@ namespace mg5amcCpu
 const int ihel = cGoodHel[ighel];
 fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+ fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
 gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1184,9 +1305,15 @@ namespace mg5amcCpu
 // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
 // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
 gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Event-by-event random choice of color #402
- gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+ // Event-by-event random choice of color and diagram #402
+ gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+ gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
 // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1224,40 +1351,6 @@ namespace mg5amcCpu
 const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
 const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
- // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
- // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
- if( allChannelIds != nullptr )
- {
- // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
- const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
- uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
- // NB: channelIds_sv is a scalar in no-SIMD C++
- channelId = channelIds_sv;
-#else
- // NB: channelIds_sv is a vector in SIMD C++
- channelId = channelIds_sv[0]; // element[0]
- for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
- {
- assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
- }
-#endif
- assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
- const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
- uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
- // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- for( int i = 0; i < neppV; ++i )
- {
- assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
- }
-#endif
- }
 #endif
 // Running sum of partial amplitudes squared for event by event color selection (#402)
 // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1272,7 +1365,8 @@ namespace mg5amcCpu
 cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
 calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1320,82 +1414,97 @@ namespace mg5amcCpu
 }
 #endif
 }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
- // Event-by-event random choice of color #402
- if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
- {
- if( channelId > mgOnGpu::nchannels )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
- assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
- }
- // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
- // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
- if( iconfig <= 0 )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
- assert( iconfig > 0 ); // SANITY CHECK #917
- }
- else if( iconfig > (int)mgOnGpu::nconfigSDE )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
- assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
- }
- fptype_sv targetamp[ncolor] = { 0 };
- // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
- for( int icolC = 0; icolC < ncolor; icolC++ )
- {
- if( icolC == 0 )
- targetamp[icolC] = fptype_sv{ 0 };
- else
- targetamp[icolC] = targetamp[icolC - 1];
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
- }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv targetamp2[ncolor] = { 0 };
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ const int vecsize = 2 * neppV;
+#else
+ const int vecsize = neppV;
+#endif
+ unsigned int channelIdVec[vecsize];
+ if( allChannelIds != nullptr )
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
 {
- if( icolC == 0 )
- targetamp2[icolC] = fptype_sv{ 0 };
- else
- targetamp2[icolC] = targetamp2[icolC - 1];
- // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+ const int ievt = ievt00 + ieppV;
+ channelIdVec[ieppV] = allChannelIds[ievt];
 }
-#endif
- for( int ieppV = 0; ieppV < neppV; ++ieppV )
+ }
+
+ // Event-by-event random choice of channel
+ if( allrnddiagram != nullptr )
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
 {
 const int ievt = ievt00 + ieppV;
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ fptype numerator_sum = 0., normalization = 0.;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
 {
-#if defined MGONGPU_CPPSIMD
- // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
- volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
- const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
- if( okcol )
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+ }
+ channelIdVec[ieppV] = mgOnGpu::nchannels;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+ if( allrnddiagram[ievt] < numerator_sum / normalization )
 {
- allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+ channelIdVec[ieppV] = ichan + 1;
 break;
 }
 }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- const int ievt2 = ievt00 + ieppV + neppV;
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+ allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+ }
+ }
+
+ // Event-by-event random choice of color #402
+ if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+ {
+ unsigned int channelId = channelIdVec[ieppV];
+ if( channelId > mgOnGpu::nchannels )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+ assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+ }
+ // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+ // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+ const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+ if( iconfig <= 0 )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+ assert( iconfig > 0 ); // SANITY CHECK #917
+ }
+ else if( iconfig > (int)mgOnGpu::nconfigSDE )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+ assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+ }
+ fptype targetamp[ncolor] = { 0 };
+ // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( icolC == 0 )
+ targetamp[icolC] = 0;
+ else
+ targetamp[icolC] = targetamp[icolC - 1];
+ if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+ }
+ const int ievt = ievt00 + ieppV;
+ //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
 for( int icolC = 0; icolC < ncolor; icolC++ )
 {
- if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+ if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
 {
- allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+ allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+ //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
 break;
 }
 }
-#endif
 }
 }
 else
@@ -1420,13 +1529,7 @@ namespace mg5amcCpu
 // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
 // [NB 'sum over final spins, average over initial spins', eg see
 // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
 for( int ipagV = 0; ipagV < npagV; ++ipagV )
 {
 const int ievt0 = ipagV * neppV;
@@ -1434,13 +1537,14 @@ namespace mg5amcCpu
 fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
 MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
 fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
- MEs_sv *= numerators_sv / denominators_sv;
+ MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
 }
 #endif
 //for( int ieppV = 0; ieppV < neppV; ieppV++ )
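Aside (illustration only): the GPU path in sigmaKin launches one calculate_jamps per good helicity on a dedicated stream and synchronizes only before the cross-helicity reduction. The same pattern in plain CUDA, with a stand-in kernel and no error checking:

  #include <cuda_runtime.h>

  __global__ void work( float* buf, int n ) // stand-in for one per-helicity kernel
  {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if( i < n ) buf[i] += 1.f;
  }

  void launchPerHelicity( float* bufs[], int nGoodHel, int n )
  {
    cudaStream_t streams[16]; // assume nGoodHel <= 16 for this sketch
    for( int ighel = 0; ighel < nGoodHel; ++ighel )
    {
      cudaStreamCreate( &streams[ighel] );
      work<<<( n + 255 ) / 256, 256, 0, streams[ighel]>>>( bufs[ighel], n ); // one helicity per stream
    }
    for( int ighel = 0; ighel < nGoodHel; ++ighel )
    {
      cudaStreamSynchronize( streams[ighel] ); // wait before the cross-helicity reduction
      cudaStreamDestroy( streams[ighel] );
    }
  }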
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
index dd4aae8a06..9191598e88 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
 int* allselcol, // output: helicity selection[nevt]
 fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
 fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/processConfig.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/processConfig.h
new file mode 100644
index 0000000000..89823b9d1d
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H
+#define MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 5;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cstddef>
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool is_good_hel[CPPProcess::ncomb];
+    sigmaKin_getGoodHel( momenta, couplings, matrix_elements, numerators, denominators,
+#ifdef MGONGPUCPP_GPUIMPL
+                         color_jamps,
+#endif
+                         is_good_hel,
+                         count );
+    sigmaKin_setGoodHel( is_good_hel );
+    return nullptr;
+  }
+
+  void initialize(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    // static local initialization is called exactly once in a thread-safe way
+    static void* dummy = initialize_impl( momenta, couplings, matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+                                          color_jamps,
+#endif
+                                          numerators,
+                                          denominators,
+                                          count );
+  }
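Aside (illustration only): the initialize wrapper above uses C++11 "magic statics": the initializer of a function-local static runs exactly once, even under concurrent first calls, so the good-helicity filtering cannot run twice. The idiom in isolation, with hypothetical names:

  #include <cstdio>

  int expensiveSetup()
  {
    std::puts( "setup runs once" ); // side effect happens on the first call only
    return 42;
  }

  int getConfig()
  {
    // C++11 guarantees thread-safe, once-only initialization of local statics
    static int config = expensiveSetup();
    return config;
  }

  int main()
  {
    getConfig(); // prints "setup runs once"
    getConfig(); // prints nothing
  }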
+
+#ifdef MGONGPUCPP_GPUIMPL
+
+  __global__ void copy_inputs(
+    const double* momenta_in,
+    const double* helicity_random_in,
+    const double* color_random_in,
+    const double* diagram_random_in,
+    const double* alpha_s_in,
+    fptype* momenta,
+    fptype* helicity_random,
+    fptype* color_random,
+    fptype* diagram_random,
+    fptype* g_s,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
+    helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
+    color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
+    g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // need to round up to twice the SIMD page size for some reason
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
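Taken together, umami.cc above exposes a pure C calling convention: the caller pairs parallel arrays of keys and void pointers for inputs and outputs. The following host-side sketch shows one way to drive it (illustrative only: the param card path, event count and zeroed momenta are placeholders, and real phase-space points would be filled in before the call):

#include "umami.h"
#include <cstdio>
#include <vector>

int main()
{
  int npar = 0;
  if( umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ) != UMAMI_SUCCESS ) return 1;

  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, "../Cards/param_card.dat" ) != UMAMI_SUCCESS ) return 1;

  const size_t count = 16, stride = count, offset = 0;
  std::vector<double> momenta( 4 * npar * count ); // [component][particle][event], event stride = stride
  std::vector<double> alpha_s( count, 0.118 );
  std::vector<double> me( count );

  UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
  const void* ins[] = { momenta.data(), alpha_s.data() };
  UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outs[] = { me.data() };

  // momenta would be filled with physical phase-space points here
  UmamiStatus status = umami_matrix_element( handle, count, stride, offset,
                                             2, in_keys, ins, 1, out_keys, outs );
  if( status == UMAMI_SUCCESS ) std::printf( "|M|^2[0] = %g\n", me[0] );
  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}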
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
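Every entry point below returns one of the UmamiStatus codes above, so callers usually route all calls through a small checking helper. A minimal sketch (the helper name is illustrative, not part of the interface):

#include "umami.h"
#include <cstdio>

static void umami_check( UmamiStatus status, const char* what )
{
  // report any non-success code together with the failing call site
  if( status != UMAMI_SUCCESS )
    std::fprintf( stderr, "%s failed with UmamiStatus %d\n", what, (int)status );
}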
+  /**
+   * Queries a global property of the compiled matrix element, such as the device
+   * it runs on or the number of particles, diagrams or helicity combinations.
+   *
+   * @param meta_key
+   *    key identifying the requested piece of metadata
+   * @param result
+   *    pointer to caller-owned storage that receives the value; its type depends
+   *    on the key (UmamiDevice for UMAMI_META_DEVICE, int for the counts)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset added to the event index when addressing the input and output arrays
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
index d16040de18..016603b556 100644
--- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@
Note that you can still compile and run aMC@NLO with the built-in PDFs
Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg
The import format was not given, so we guess it as command
set stdout_level DEBUG
set output information to level: 10
@@ -56,7 +57,7 @@ set zerowidth_tchannel F
define q = u c d s u~ c~ d~ s~
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.004274129867553711
+DEBUG: model prefixing takes 0.001806020736694336
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions
DEBUG: remove interactions: u s w+ at order: QED=1
@@ -165,13 +166,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams.
INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams.
INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams.
-8 processes with 40 diagrams generated in 0.059 s +8 processes with 40 diagrams generated in 0.080 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -183,45 +184,45 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=1 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.023 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=1 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.037 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.105 s +ALOHA: aloha creates 2 routines in 0.081 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.535s -user 0m0.481s -sys 0m0.048s -Code generation completed in 1 seconds +real 0m2.356s +user 0m0.505s +sys 0m0.153s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
#else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
#endif
   }

+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
#include "CPPProcess.h"
#include "GpuRuntime.h"
#include "Parameters_sm.h"
+#include "processConfig.h"

#include

@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;

   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;

#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
index e57428e73e..0bf3c7da89 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
@@ -27,6 +27,7 @@
#include "MemoryAccessMomenta.h"
#include "MemoryAccessWavefunctions.h"
#include "color_sum.h"
+#include "processConfig.h"

#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#include "MemoryAccessDenominators.h"
@@ -101,6 +102,69 @@ namespace mg5amcGpu
namespace mg5amcCpu
#endif
{
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               
const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -782,9 +843,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, 
allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -860,8 +920,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -923,25 +982,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -986,16 +1055,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random 
numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1063,6 +1160,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1072,6 +1170,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1083,8 
+1183,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1112,7 +1214,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1128,7 +1229,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1142,11 +1243,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1158,6 +1262,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1166,9 +1271,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, 
allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1179,9 +1285,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1219,40 +1331,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1267,7 +1345,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1315,82 +1394,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1415,13 +1509,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1429,13 +1517,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index bd42537623..0bf2e4625f 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* 
allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/processConfig.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/processConfig.h new file mode 100644 index 0000000000..fe66e4e760 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
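// [Editor's note - illustrative sketch, not part of the generated file] processConfig.h
// exposes the per-process diagram count as a compile-time constant so that the kernels in
// this patch can size the per-diagram numerator buffers as nevt * ndiagrams (see the
// gpuMemset and ieventAccessRecord changes above). The helpers below are hypothetical and
// only model the two flat layouts the patch uses:
inline int numIndexGpu( int ievt, int idiag, int ndiagrams )
{
  return ievt * ndiagrams + idiag; // CUDA/HIP: all diagrams of one event are contiguous
}
inline int numIndexCpuSimd( int ievt, int idiag, int ndiagrams, int neppV )
{
  // C++ SIMD: within a page of neppV events, the neppV values of one diagram are contiguous
  return ( ievt / neppV ) * neppV * ndiagrams + idiag * neppV + ievt % neppV;
}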
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXU_H +#define MG5_CONFIG_SIGMA_SM_GU_TTXU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GU_TTXU_H \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 57dd4fee2d..959f91ae65 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -782,9 +843,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + 
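// [Editor's note] In the helicity-filtering path below, the multichannel machinery is not
// needed, so the old convention of passing allChannelIds = nullptr (GPU) or channelId = 0
// (C++) to calculate_jamps is replaced by hard-coding the new boolean parameter to false.
// In sigmaKin itself the same flag is instead derived from the run configuration, as in the
// hunks elsewhere in this patch:
//   bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
// i.e. channel weights are accumulated both when Fortran supplies fixed channel ids and
// when event-by-event diagram sampling is requested via random numbers.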
constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -860,8 +920,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -923,25 +982,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -986,16 +1055,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel 
channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1063,6 +1160,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1072,6 +1170,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1083,8 +1183,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // 
output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1112,7 +1214,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1128,7 +1229,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1142,11 +1243,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1158,6 +1262,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1166,9 +1271,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = 
allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1179,9 +1285,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1219,40 +1331,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1267,7 +1345,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1315,82 +1394,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1415,13 +1509,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1429,13 +1517,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index dd4aae8a06..9191598e88 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const 
fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/processConfig.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/processConfig.h new file mode 100644 index 0000000000..89823b9d1d --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
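// [Editor's note - illustrative sketch] The event-by-event diagram choice added to the
// select_col_and_diag kernel and to the C++ sigmaKin above is an inverse-CDF draw over the
// per-diagram numerators: channels without an SDE config (channel2iconfig == -1) are
// skipped, the remaining numerators are normalised, and the first channel whose cumulative
// weight exceeds the random number is selected. Stripped of the SIMD/GPU indexing, and with
// a hypothetical helper name, the algorithm is:
unsigned int sampleChannel( const double* numerators,   // per-diagram numerators for one event
                            const int* channel2iconfig, // -1 marks channels without an SDE config
                            unsigned int nchannels,
                            double rnd )                // uniform random number in [0,1)
{
  double norm = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) norm += numerators[ichan];
  double cumsum = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    cumsum += numerators[ichan];
    if( rnd < cumsum / norm ) return ichan + 1; // channelId is 1-based, as in Fortran
  }
  return nchannels; // fallback, as in the kernels, if rounding leaves rnd >= cumsum/norm
}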
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H +#define MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + +
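// [Editor's note] The initialize() wrapper below funnels the one-time good-helicity setup
// through a function-local static: C++11 guarantees that such a static is initialized
// exactly once, even under concurrent calls ("magic statics"), which is why using
// initialize_impl() as the initializer of 'dummy' is thread-safe. A minimal standalone
// model of the pattern:
//
//   #include <cstdio>
//   void doOnce()
//   {
//     static const bool done = []() { std::printf( "runs exactly once\n" ); return true; }();
//     (void)done; // silence the unused-variable warning
//   }
//
// Every thread that reaches the static blocks until the first initialization completes.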
+  void initialize(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    // static local initialization is called exactly once in a thread-safe way
+    static void* dummy = initialize_impl( momenta, couplings, matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+                                          color_jamps,
+#endif
+                                          numerators,
+                                          denominators,
+                                          count );
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__
+#endif
+  void
+  transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride )
+  {
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    std::size_t i_page = i_event / page_size;
+    std::size_t i_vector = i_event % page_size;
+
+    for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part )
+    {
+      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
+      {
+        momenta_out[i_page * CPPProcess::npar * 4 * page_size +
+                    i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event];
+      }
+    }
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+
+  __global__ void copy_inputs(
+    const double* momenta_in,
+    const double* helicity_random_in,
+    const double* color_random_in,
+    const double* diagram_random_in,
+    const double* alpha_s_in,
+    fptype* momenta,
+    fptype* helicity_random,
+    fptype* color_random,
+    fptype* diagram_random,
+    fptype* g_s,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
+    helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
+    color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
+    g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    // NB: the static_cast template arguments in this file were lost in this diff
+    // and have been reconstructed from the surrounding variable types
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    // NB: the <<<...>>> launch configurations below were lost in this diff;
+    // <<<n_blocks, n_threads, 0, gpu_stream>>> is the natural reconstruction
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // need to round up to twice the SIMD page size (presumably because mixed precision
+    // processes two neppV pages per sigmaKin iteration, see #924)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    // NB: the HostBufferBase template arguments below were lost in this diff
+    // and have been reconstructed from how each buffer is used
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h> /* NB: the bracketed header name was lost in this diff; stddef.h (for size_t) is an educated guess */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves a global metadata entry of the matrix element implementation
+   * (device type, particle count, diagram count, ...).
+   *
+   * @param meta_key
+   *    key of the metadata entry to query
+   * @param result
+   *    pointer to caller-allocated memory that receives the value; its type
+   *    depends on the key (UmamiDevice for UMAMI_META_DEVICE, int otherwise)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt
index faef5b2d67..11b54f703e 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt
+++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
 set zerowidth_tchannel F
 set auto_convert_model T
 save options auto_convert_model
-save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
+save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
 import model heft
 INFO: Restrict model heft with file models/heft/restrict_default.dat .
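(Taken together, the `umami.h` API above passes all event data across the C ABI as type-erased arrays selected by key, which is what lets minor versions add new inputs and outputs without breaking binary compatibility. A hypothetical CPU-side caller, sketched here for illustration only, with the param card path, particle count and momenta values as placeholders:

```cpp
#include "umami.h"

#include <cstdio>

int main()
{
  UmamiHandle handle;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; // placeholder path

  int npar = 0;
  umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ); // number of external particles

  // One event (count=1, stride=1, offset=0); momenta follow the
  // stride*(npar*i_mom+i_part)+i_event indexing used by transpose_momenta.
  double momenta[4 * 5] = { 0. }; // placeholder: fill with physical four-momenta before a real call
  double me2 = 0.;

  const UmamiInputKey in_keys[1] = { UMAMI_IN_MOMENTA };
  const void* const inputs[1] = { momenta };
  const UmamiOutputKey out_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* const outputs[1] = { &me2 };

  UmamiStatus status = umami_matrix_element( handle, 1, 1, 0, 1, in_keys, inputs, 1, out_keys, outputs );
  if( status == UMAMI_SUCCESS ) std::printf( "|M|^2 = %g\n", me2 );

  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}
```
)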
DEBUG: Simplifying conditional expressions  @@ -129,14 +130,14 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --ve Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -148,55 +149,57 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s -Wrote files for 12 helas calls in 0.062 s +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (4 diagrams) in 0.011 s +Wrote files for 12 helas calls in 2.377 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.193 s +ALOHA: aloha creates 4 routines in 0.164 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.178 s +ALOHA: aloha creates 8 routines in 0.125 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m2.118s -user 0m1.750s -sys 0m0.364s -Code generation completed in 2 seconds +real 0m9.223s +user 0m1.254s +sys 0m0.660s +Code generation completed in 9 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -217,9 +220,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -246,9 +249,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
 assert( useChannelIds == false );
 sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
 }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer ); // NB: the cast's template argument was lost in this diff; it follows from the return type
+  }
+
 // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
 // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
 static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h
index 7d7b960511..5e318bc0a4 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_heft.h"
+#include "processConfig.h"
 
 #include
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
 typedef BufferBase<fptype> BufferNumerators;
 
 // The size (number of elements) per event in a memory buffer for numerators
- constexpr size_t sizePerEventNumerators = 1;
+ // (should be equal to the number of diagrams in the process)
+ constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc
index fbb0c2effb..c04f054aa9 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef
MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +452,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 2. 
* amp_sv[0]; @@ -404,8 +468,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -431,8 +501,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -746,9 +819,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -824,8 +896,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -887,25 +958,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -950,16 +1031,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1027,6 +1136,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1036,6 +1146,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1047,8 +1159,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1076,7 +1190,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1092,7 +1205,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1106,11 +1219,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1122,6 +1238,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1130,9 +1247,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1143,9 +1261,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1183,40 +1307,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1231,7 +1321,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1279,82 +1370,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1379,13 +1485,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1393,13 +1493,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h
index a4c60bf837..bfcb1209e9 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the selected channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
     int* allselcol, // output: helicity selection[nevt]
     fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the selected channel weight
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
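The new allrnddiagram input and allDiagramIdsOut output in these signatures implement an inverse-CDF choice of the single-diagram channel: the per-diagram numerators are first summed into a normalization, then their cumulative sum is scanned until it exceeds the event's random number. A standalone sketch of that selection for one event, with hypothetical plain arrays (the real code above additionally skips channels whose channel2iconfig entry is -1):

// Sketch only: pick a 1-based channelId from per-diagram weights and one
// uniform random number in [0,1), mirroring the selection logic in the patch.
unsigned int sampleChannel( const double* numerators, // [ndiagrams] weights for this event
                            int ndiagrams,
                            double rnd ) // uniform random number in [0,1)
{
  double normalization = 0.;
  for( int i = 0; i < ndiagrams; ++i ) normalization += numerators[i];
  if( normalization <= 0. ) return ndiagrams; // degenerate case: fall back to the last channel
  double cumulative = 0.;
  for( int i = 0; i < ndiagrams; ++i )
  {
    cumulative += numerators[i];
    if( rnd < cumulative / normalization ) return i + 1; // 1-based, like allDiagramIdsOut
  }
  return ndiagrams; // fallback for rnd ~ 1.0 (mirrors channelId = mgOnGpu::nchannels above)
}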
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/processConfig.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/processConfig.h
new file mode 100644
index 0000000000..f7dbd383b0
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_HEFT_GG_BBX_H
+#define MG5_CONFIG_SIGMA_HEFT_GG_BBX_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 4;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_HEFT_GG_BBX_H
\ No newline at end of file
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))

 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o

 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cmath>
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool
is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
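The copy_inputs/copy_outputs kernels and transpose_momenta pin down the caller-facing memory layout: every per-event array is indexed as [field][stride] with a common event offset along the batch axis, while internally the momenta are repacked into the cudacpp AOSOA pages of neppM events. A sketch of the indexing a caller would use to fill the momenta buffer, assuming offset + count <= stride (hypothetical fill loop, double precision):

#include <cstddef>
#include <vector>

// Layout sketch matching transpose_momenta / copy_outputs:
//   momenta_in[stride * ( npar * i_mom + i_part ) + offset + i_event]
//   amp2_out[stride * i_diag + offset + i_event]
void fill_momenta( std::vector<double>& momenta_in, std::size_t stride, std::size_t offset,
                   std::size_t count, std::size_t npar )
{
  for( std::size_t i_event = 0; i_event < count; ++i_event )
    for( std::size_t i_part = 0; i_part < npar; ++i_part )
      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )          // E, px, py, pz
        momenta_in[stride * ( npar * i_mom + i_part ) + offset + i_event] = 0.; // fill with real momenta
}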
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_heft_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
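Every gpuMallocAsync above is paired with a gpuFreeAsync on the same stream at the end of the call. A hypothetical RAII guard (not part of this patch) built on the same GpuAbstraction.h macros shows how that pairing could be made automatic and exception-safe:

// Sketch only: stream-ordered buffer guard using the gpu* macros defined in GpuAbstraction.h.
template <typename T>
struct StreamBuffer
{
  T* ptr = nullptr;
  gpuStream_t stream;
  StreamBuffer( std::size_t n, gpuStream_t s ) : stream( s )
  {
    gpuMallocAsync( &ptr, n * sizeof( T ), stream ); // allocation is ordered on 'stream'
  }
  ~StreamBuffer() { gpuFreeAsync( ptr, stream ); }   // release is ordered on the same stream
  StreamBuffer( const StreamBuffer& ) = delete;
  StreamBuffer& operator=( const StreamBuffer& ) = delete;
};
// Usage: StreamBuffer<fptype> momenta( rounded_count * CPPProcess::npar * 4, gpu_stream );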
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to a multiple of two SIMD pages (mixed-precision SIMD builds process two neppV pages per sigmaKin iteration, cf. #924)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_heft_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
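End to end, umami.cc expects a caller to follow a fixed sequence: initialize once per thread, pass inputs and outputs as parallel key/pointer arrays, then free the handle. A minimal CPU-side sketch under those assumptions (hypothetical driver; momenta filled in the layout sketched earlier, stride equal to count and offset zero):

#include "umami.h"
#include <cstddef>
#include <vector>

int run_umami( const char* param_card_path, std::size_t count )
{
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, param_card_path ) != UMAMI_SUCCESS ) return 1;

  std::vector<double> momenta( count * 4 * 4 ); // npar=4 particles x 4 components, stride=count
  std::vector<double> m2( count );              // one |M|^2 per event
  UmamiInputKey in_keys[1] = { UMAMI_IN_MOMENTA };
  const void* inputs[1] = { momenta.data() };
  UmamiOutputKey out_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outputs[1] = { m2.data() };

  UmamiStatus status = umami_matrix_element( handle, count, count /* stride */, 0 /* offset */,
                                             1, in_keys, inputs, 1, out_keys, outputs );
  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}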
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                            _
+ *                           (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves global metadata about the matrix element implementation, such as the
+   * device it was built for or the number of particles, diagrams and helicity
+   * combinations.
+   *
+   * @param meta_key
+   *     key identifying the piece of metadata to retrieve
+   * @param result
+   *     pointer to the location the result is written to; its type depends on the
+   *     meta key (UmamiDevice* for UMAMI_META_DEVICE, int* for the various counts)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *     pointer to an instance of the subprocess. Has to be cleaned up by
+   *     the caller with `umami_free`.
+   * @param param_card_path
+   *     path to the parameter file
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     real part of the parameter value
+   * @param parameter_imag
+   *     imaginary part of the parameter value. Ignored for real valued parameters.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *     pointer to double to return imaginary part of the parameter value. Ignored
+   *     for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param count
+   *     number of events to evaluate the matrix element for
+   * @param stride
+   *     stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *     offset of the event index
+   * @param input_count
+   *     number of inputs to the matrix element
+   * @param input_keys
+   *     pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *     pointer to an array of void pointers to the inputs. The type of the inputs
+   *     depends on the input key
+   * @param output_count
+   *     number of outputs to the matrix element
+   * @param output_keys
+   *     pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *     pointer to an array of void pointers to the outputs. The type of the outputs
+   *     depends on the output key. The caller is responsible for allocating memory for
+   *     the outputs.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
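Before allocating output arrays such as the per-diagram amp2 buffer, a generic caller can size them through the metadata interface. A short sketch, assuming the counts are written as plain int as in the implementation above:

#include "umami.h"
#include <cstdio>

void print_process_shape()
{
  UmamiDevice device;
  int npar = 0, ndiagrams = 0, nhel = 0;
  if( umami_get_meta( UMAMI_META_DEVICE, &device ) == UMAMI_SUCCESS )
    std::printf( "device key: %d\n", (int)device );
  umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar );     // e.g. 4 for g g -> b b~
  umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiagrams ); // e.g. 4, cf. processConfig::ndiagrams
  umami_get_meta( UMAMI_META_HELICITY_COUNT, &nhel );
  std::printf( "npar=%d ndiagrams=%d nhel=%d\n", npar, ndiagrams, nhel );
}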
diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt
index 5208ed190c..13878ae8fd 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt
+++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,26 +49,26 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
 set zerowidth_tchannel F
 set auto_convert_model T
 save options auto_convert_model
-save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
+save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
 import model heft
-INFO: download model from http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz to the following directory: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models
---2025-10-22 11:47:55--  http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz
-Resolving madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)...
130.104.1.243 -Connecting to madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)|130.104.1.243|:80... connected. +INFO: download model from https://madgraph.mi.infn.it/Downloads/models/heft.tgz to the following directory: /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/models  +--2025-12-11 12:36:44-- https://madgraph.mi.infn.it/Downloads/models/heft.tgz +Resolving madgraph.mi.infn.it (madgraph.mi.infn.it)... 192.135.21.75 +Connecting to madgraph.mi.infn.it (madgraph.mi.infn.it)|192.135.21.75|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 50876 (50K) [application/x-gzip] Saving to: ‘tmp.tgz’ - 0K .......... .......... .......... .......... ......... 100% 921K=0.05s + 0K .......... .......... .......... .......... ......... 100% 3.00M=0.02s -2025-10-22 11:47:55 (921 KB/s) - ‘tmp.tgz’ saved [50876/50876] +2025-12-11 12:36:44 (3.00 MB/s) - ‘tmp.tgz’ saved [50876/50876] heft/ heft/write_param_card.py @@ -104,7 +105,7 @@ INFO: load particles INFO: load vertices WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.004904985427856445  +DEBUG: model prefixing takes 0.0019080638885498047  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -170,49 +171,49 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.004 s +1 processes with 4 diagrams generated in 0.006 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 
'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. -Generated helas calls for 1 subprocesses (4 diagrams) in 0.007 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files 
CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +Generated helas calls for 1 subprocesses (4 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.185 s +ALOHA: aloha creates 4 routines in 0.134 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
quit -real 0m0.821s -user 0m0.568s -sys 0m0.084s -Code generation completed in 1 seconds +real 0m2.583s +user 0m0.606s +sys 0m0.182s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }

+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h
index 7d7b960511..5e318bc0a4 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_heft.h"
+#include "processConfig.h"

 #include <memory>

@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;

   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;

 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc
index 8fc4cf7184..e97d656ef0 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"

 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned
int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -742,9 +803,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; 
// disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -820,8 +880,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -883,25 +942,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += hAllDenominators[ievt]; // keep the denominator running sum over helicities (as in the pre-existing code) + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -946,16 +1015,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1023,6 +1120,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1032,6 +1130,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1043,8 +1143,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1072,7 +1174,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1088,7 +1189,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1102,11 +1203,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1118,6 +1222,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1126,9 +1231,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = 
ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1139,9 +1245,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1179,40 +1291,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1227,7 +1305,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1275,82 +1354,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1375,13 +1469,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1389,13 +1477,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h index a4c60bf837..bfcb1209e9 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/processConfig.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/processConfig.h new file mode 100644 index 0000000000..f7dbd383b0 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
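The new processConfig.h just below defines the compile-time diagram count that sizes every per-diagram numerator buffer in this patch. As a standalone sketch (not part of the patch; all names except ndiagrams are hypothetical), the two indexing conventions used above and the inverse-CDF diagram sampling that consumes them look like this:

// Standalone sketch (not part of the patch): indexing of the per-diagram numerator
// buffers introduced here, and the diagram (channel) sampling built on top of them.
#include <cassert>
#include <cstddef>
namespace sketch
{
  constexpr int ndiagrams = 4; // processConfig::ndiagrams for gg -> bb~ in HEFT

  // GPU layout: the ndiagrams numerators of one event are contiguous,
  // allNumerators[ievt * ndiagrams + idiag] (cf. calculate_jamps above).
  inline std::size_t gpuIndex( std::size_t ievt, int idiag )
  {
    assert( idiag < ndiagrams );
    return ievt * ndiagrams + idiag;
  }

  // SIMD C++ layout: pages of neppV events with the diagram index in the middle,
  // i.e. [ipage][idiag][lane] (cf. the ievt / neppV * neppV * ndiagrams + ichan * neppV
  // + ieppV % neppV expression in sigmaKin).
  inline std::size_t simdIndex( std::size_t ievt, int idiag, std::size_t neppV )
  {
    return ( ievt / neppV ) * neppV * ndiagrams + idiag * neppV + ( ievt % neppV );
  }

  // Diagram choice as in select_col_and_diag: inverse-CDF sampling on the numerators
  // (the patch additionally skips channels with mgOnGpu::channel2iconfig[ichan] == -1).
  inline unsigned int sampleChannel( const double* numerators, double rnd )
  {
    double norm = 0.;
    for( int i = 0; i < ndiagrams; ++i ) norm += numerators[i];
    double cumsum = 0.;
    for( int i = 0; i < ndiagrams; ++i )
    {
      cumsum += numerators[i];
      if( rnd < cumsum / norm ) return i + 1; // channelIds are 1-based (Fortran convention)
    }
    return ndiagrams; // fallback for rnd ~ 1 under rounding, as in the patch
  }
}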
+ + +#ifndef MG5_CONFIG_SIGMA_HEFT_GG_BBX_H +#define MG5_CONFIG_SIGMA_HEFT_GG_BBX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 4; + +} + +#endif // MG5_CONFIG_SIGMA_HEFT_GG_BBX_H \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( 
is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
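+ // Dispatch each caller-provided (key, pointer) pair onto the matching typed input pointer; keys this implementation does not support are rejected with UMAMI_ERROR_UNSUPPORTED_INPUT.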
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_heft_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round up to twice the SIMD page size (the mixed-precision SIMD path processes two neppV pages per iteration) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_heft_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata about the matrix element code (device, particle count, + * diagram count, ...). + * + * @param meta_key + * key identifying the metadata entry to query + * @param result + * pointer to caller-allocated memory receiving the value; its type depends + * on the key + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @param param_card_path + * path to the parameter file + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index b5ca9e6bb6..7f8baeac9e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004863262176513672  +DEBUG: model prefixing takes 0.0018579959869384766  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . 
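A minimal usage sketch of the UMAMI C API declared in umami.h above (not part of the patch): the event count, the alpha_s value, the param_card path, the momenta layout, and the assumption that UMAMI_META_PARTICLE_COUNT is returned as an int are all illustrative.

    // Hedged usage sketch of the umami.h API above; values and layout are assumptions.
    #include "umami.h"
    #include <cstdio>
    #include <vector>

    int main()
    {
      UmamiHandle handle = nullptr;
      if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; // hypothetical path

      int nparticles = 0; // assumption: this meta value is returned as an int
      umami_get_meta( UMAMI_META_PARTICLE_COUNT, &nparticles );

      const size_t count = 16, stride = count, offset = 0;    // one contiguous batch of 16 events
      std::vector<double> momenta( 4 * nparticles * stride ); // E, px, py, pz per particle; fill with phase-space points
      std::vector<double> alpha_s( stride, 0.118 );           // strong coupling per event
      std::vector<double> m2( stride );                       // |M|^2 per event, filled by the call

      UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
      const void* inputs[] = { momenta.data(), alpha_s.data() };
      UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
      void* outputs[] = { m2.data() };

      if( umami_matrix_element( handle, count, stride, offset, 2, in_keys, inputs, 1, out_keys, outputs ) == UMAMI_SUCCESS )
        std::printf( "|M|^2[0] = %g\n", m2[0] );

      umami_free( handle ); // every umami_initialize needs a matching umami_free
      return 0;
    }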
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +181,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.093 s +4 processes with 8 diagrams generated in 0.100 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. @@ -222,21 +223,21 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.520 s +12 processes with 144 diagrams generated in 0.282 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -270,9 +271,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 
[model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -281,9 +282,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -292,9 +293,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -303,9 +304,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -314,9 +315,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -325,9 +326,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -336,9 +337,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -347,21 +348,21 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.172 s -Wrote files for 212 helas calls in 0.856 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.122 s +Wrote files for 212 helas calls in 17.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.166 s +ALOHA: aloha creates 3 routines in 0.141 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.150 s +ALOHA: aloha creates 6 routines in 0.115 s FFV1 FFV1 FFV1 @@ -369,32 +370,34 @@ ALOHA: aloha creates 6 routines in 0.150 s FFV2 FFV2 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m4.809s -user 0m4.082s -sys 0m0.695s -Code generation completed in 5 seconds +real 0m25.842s +user 0m2.903s +sys 0m1.311s +Code generation completed in 26 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -415,9 +418,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -444,9 +447,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define 
GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h index 71a4c3f155..058c89b2f9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm_no_b_mass.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc index 0893180611..96144a4b95 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned 
int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, accumulate per-diagram numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, accumulate per-diagram numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -395,8 +456,11 @@ // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv 
+= cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -409,8 +473,11 @@ // Amplitude(s) for diagram number 2 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -759,9 +826,8 @@ gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false /* storeChannelWeights */, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -837,8 +903,7 @@ cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false /* storeChannelWeights */, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -900,25 +965,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -963,16 +1038,44 @@ #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1040,6 +1143,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1049,6 +1153,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1060,8 +1166,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1089,7 +1197,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1105,7 +1212,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1119,11 +1226,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1135,6 +1245,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1143,9 +1254,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1156,9 +1268,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1196,40 +1314,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1244,7 +1328,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1292,82 +1377,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1392,13 +1492,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1406,13 +1500,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h index b9c21cb625..8db125293c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/processConfig.h new file mode 100644 index 0000000000..4f350b6335 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/processConfig.h 
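Note on the buffer layout introduced above: the scalar per-event numerator becomes an array of processConfig::ndiagrams entries per event, grouped in SIMD pages of neppV lanes. A minimal standalone sketch of the implied indexing follows (numeratorIndex is an illustrative name, not part of the patch):

// Sketch only, assuming the [page][diagram][lane] layout used by the hunks above,
// i.e. allNumerators[ ievt/neppV * neppV*ndiagrams + ichan*neppV + ievt%neppV ]
inline int numeratorIndex( int ievt, int ichan, int neppV, int ndiagrams )
{
  const int ipagV = ievt / neppV; // SIMD page containing this event
  const int ieppV = ievt % neppV; // lane of this event within its page
  return ipagV * neppV * ndiagrams + ichan * neppV + ieppV;
}
// In the CUDA path (one event per thread, effectively neppV == 1) this reduces
// to allNumerators[ ievt * ndiagrams + ichan ], as used in calculate_jamps.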
@@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWM_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWM_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWM_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc index 2a56cf5ec4..801cc458c3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -395,8 +456,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. 
* amp_sv[0]; @@ -409,8 +473,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -759,9 +826,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -837,8 +903,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
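The normalise_output changes in the next hunk reduce the per-helicity numerator super-buffers into per-diagram totals before applying the single-diagram enhancement: once N[k] = sum over helicities of |A_k|^2 and the denominator D = sum over diagrams and helicities of |A_k|^2 are known, the weight applied to the ME for a 1-based channelId is N[channelId-1]/D. A hedged standalone sketch (channelWeight and the double stand-in for fptype are illustrative only):

using fptype = double; // stand-in for the plugin's configurable fptype
// Sketch of the reweighting step performed by normalise_output below:
// multiply |M|^2 by the selected diagram's share of the total.
inline fptype channelWeight( const fptype* N, fptype D, unsigned int channelId )
{
  return N[channelId - 1] / D; // channelId is 1-based (Fortran convention)
}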
#endif /* clang-format on */ @@ -900,25 +965,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -963,16 +1038,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1040,6 +1143,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1049,6 +1153,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1060,8 +1166,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1089,7 +1197,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1105,7 +1212,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1119,11 +1226,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1135,6 +1245,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1143,9 +1254,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1156,9 +1268,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1196,40 +1314,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1244,7 +1328,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1292,82 +1377,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1392,13 +1492,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1406,13 +1500,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h index 43024d30fd..88ccb8aebc 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/processConfig.h new file mode 100644 index 0000000000..38d2d5ed20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/processConfig.h 
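For reference, the event-by-event channel sampling added in the hunks above draws a channel in proportion to its per-diagram numerator via a cumulative sum over the normalized totals. A simplified standalone sketch (sampleChannel is an illustrative name; the real loops also skip channels with mgOnGpu::channel2iconfig[ichan] == -1 and use the SIMD-page indexing shown earlier):

using fptype = double; // stand-in for the plugin's configurable fptype
unsigned int sampleChannel( const fptype* N, unsigned int nchannels, fptype rnd )
{
  fptype norm = 0.;
  for( unsigned int k = 0; k < nchannels; k++ ) norm += N[k]; // total numerator
  fptype cumul = 0.;
  for( unsigned int k = 0; k < nchannels; k++ )
  {
    cumul += N[k];
    if( rnd < cumul / norm ) return k + 1; // 1-based channelId
  }
  return nchannels; // rounding guard, mirroring the fallback in the code above
}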
@@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWP_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWP_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWP_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc index 6e87d2186e..733db97179 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -428,8 +495,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -442,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -457,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -471,8 +547,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -485,8 +564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -499,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
@@ -513,8 +598,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[8] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 2. * amp_sv[0];
       jamp_sv[3] += 1. / 6. * amp_sv[0];
@@ -527,8 +615,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 10
       FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[9] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -541,8 +632,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[10] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -555,8 +649,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[11] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 2. * amp_sv[0];
       jamp_sv[3] += 1. / 6. * amp_sv[0];
@@ -955,9 +1052,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1033,8 +1129,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1096,25 +1191,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,   // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,    // if true, compute final multichannel weights
+    bool mulChannelWeight,       // if true, multiply matrix element by channel weight
 #endif
-    const fptype globaldenom ) /* clang-format on */
+    const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
-      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt];
-      totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      totAllDenominators[ievt] += hAllDenominators[ievt]; // keep the running sum of denominators over helicities
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
     }
+    if( mulChannelWeight )
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
+    }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
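For reference, the reduction performed by the new normalise_output body can be written out on plain arrays as below. This is a sketch under this patch's layout assumptions (numerators are [nGoodHel][nevt][ndiagrams] with the helicity #0 slice reused as the running total, denominators are [nGoodHel][nevt]; reduceChannelWeights is an illustrative name, not part of the patch):

```cpp
void reduceChannelWeights( double* ghelNumerators,         // [nGoodHel][nevt][ndiagrams]
                           double* ghelDenominators,       // [nGoodHel][nevt]
                           double* MEs,                    // [nevt], already divided by globaldenom
                           const unsigned int* channelIds, // [nevt], 1-based diagram ids
                           int nGoodHel, int ndiagrams, int nevt, bool mulChannelWeight )
{
  for( int ievt = 0; ievt < nevt; ++ievt )
  {
    double* tot = ghelNumerators + ievt * ndiagrams; // helicity #0 slice becomes the total
    for( int ighel = 1; ighel < nGoodHel; ++ighel )
    {
      const double* h = ghelNumerators + ( ievt + ighel * nevt ) * ndiagrams;
      for( int idiag = 0; idiag < ndiagrams; ++idiag ) tot[idiag] += h[idiag];
      ghelDenominators[ievt] += ghelDenominators[ievt + ighel * nevt];
    }
    // The single-diagram enhancement weight is numerator(chosen diagram) / denominator(all diagrams)
    if( mulChannelWeight ) MEs[ievt] *= tot[channelIds[ievt] - 1] / ghelDenominators[ievt];
  }
}
```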
@@ -1159,16 +1264,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,          // output: color selection[nevt]
-              const fptype* allrndcol, // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )         // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+                       const fptype* allrndcol, // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators, // input: all numerators
+                       const fptype* allDenominators, // input: all denominators
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1236,6 +1369,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -1245,6 +1379,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1256,8 +1392,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
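The channel choice in the select_col_and_diag kernel above is inverse-CDF sampling over the per-diagram numerators: a diagram is drawn with probability numerator[ichan] / sum(numerators), skipping channels with no SDE config. A scalar sketch of the same logic (sampleDiagram is an illustrative name; channel2iconfig == -1 marks unused channels, as in the kernel):

```cpp
unsigned int sampleDiagram( const double* numerators,   // [nchannels] for one event
                            const int* channel2iconfig, // -1 marks channels without an SDE config
                            unsigned int nchannels,
                            double rnd )                // random number in [0,1)
{
  double normalization = 0.;
  for( unsigned int i = 0; i < nchannels; ++i )
    if( channel2iconfig[i] != -1 ) normalization += numerators[i];
  double cumulative = 0.;
  for( unsigned int i = 0; i < nchannels; ++i )
  {
    if( channel2iconfig[i] == -1 ) continue;
    cumulative += numerators[i];
    if( rnd < cumulative / normalization ) return i + 1; // 1-based channelId
  }
  return nchannels; // fallback, as in the kernel above
}
```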
@@ -1285,7 +1423,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1301,7 +1438,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1315,11 +1452,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1331,6 +1471,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     //     In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1339,9 +1480,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
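The loop above issues one calculate_jamps launch per good helicity, each on its own stream and each writing a disjoint ighel slice of the super-buffers, so no synchronization is needed until the cross-helicity reduction. A minimal sketch of this pattern using the wrappers from GpuAbstraction.h (someKernel, superBuffer, sliceSize and streams are placeholders, not names from the patch):

```cpp
for( int ighel = 0; ighel < nGoodHel; ighel++ )
{
  fptype* slice = superBuffer + ighel * sliceSize; // disjoint per-stream output => no races
  gpuLaunchKernelStream( someKernel, gpublocks, gputhreads, streams[ighel], slice, nevt );
}
for( int ighel = 0; ighel < nGoodHel; ighel++ )
  gpuStreamSynchronize( streams[ighel] ); // join all helicity streams before reducing
```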
@@ -1352,9 +1494,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1392,40 +1540,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1440,7 +1554,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1488,82 +1603,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype_sv targetamp2[ncolor] = { 0 };
-      for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-      for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams
+                                           + ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams
+                                           + ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
           //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1588,13 +1718,7 @@ namespace mg5amcCpu
       // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
       // [NB 'sum over final spins, average over initial spins', eg see
       // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1602,13 +1726,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
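Per event, the C++ tail of sigmaKin above reduces to the following scalar logic once the SIMD accessors are stripped away (a sketch; normaliseHost, helcolDenom and the flat buffers are stand-ins for the real E_ACCESS/NUM_ACCESS/DEN_ACCESS machinery):

```cpp
void normaliseHost( double* MEs, const double* numerators, const double* denominators,
                    const unsigned int* channelIds, double helcolDenom,
                    int ndiagrams, int nevt, bool mulChannelWeight )
{
  for( int ievt = 0; ievt < nevt; ++ievt )
  {
    MEs[ievt] /= helcolDenom; // average over helicities and colors
    if( mulChannelWeight && channelIds != nullptr ) // channelIds[ievt] is a 1-based diagram id
      MEs[ievt] *= numerators[ievt * ndiagrams + channelIds[ievt] - 1] / denominators[ievt];
  }
}
```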
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h
index b168a7dacf..c06c1088d2 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h
@@ -164,6 +164,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -172,6 +173,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -188,6 +191,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -195,6 +199,8 @@ namespace mg5amcCpu
     int* allselcol, // output: helicity selection[nevt]
     fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/processConfig.h
new file mode 100644
index 0000000000..743c903011
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWMG_H
+#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWMG_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 12;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWMG_H
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
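The new per-process processConfig.h exposes the diagram count as a compile-time constant, which is what lets the numerator buffers in the hunks above and below be sized as nevt * ndiagrams without any runtime query. A sketch of an illustrative consumer (NumeratorBuffer is hypothetical, not part of the patch):

```cpp
#include "processConfig.h"
#include <array>

// Compile-time sizing from processConfig::ndiagrams: one slot per diagram per event.
template<int NEVT>
struct NumeratorBuffer
{
  std::array<double, NEVT * processConfig::ndiagrams> data{};
  double& at( int ievt, int idiag ) { return data[ievt * processConfig::ndiagrams + idiag]; }
};
```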
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc
index 563e3c6ead..c3009cc038 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,6 +100,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+                                               )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -240,7 +304,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps,        // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights,
     fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s,    // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -249,7 +313,7 @@ namespace mg5amcCpu
 #else
     cxtype_sv* allJamp_sv,   // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights,
    fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv,     // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -345,7 +409,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -358,7 +423,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -367,12 +432,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -398,8 +459,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[2] += 1. / 6. * amp_sv[0];
@@ -412,8 +476,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 2
       FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[2] += 1. / 6. * amp_sv[0];
@@ -428,8 +495,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 3
       FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[1] += 1. / 6. * amp_sv[0];
@@ -442,8 +512,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 4
       FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[1] += 1. / 6. * amp_sv[0];
@@ -457,8 +530,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[4] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -471,8 +547,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[5] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -485,8 +564,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[6] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -499,8 +581,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 8
       FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[7] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -513,8 +598,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[8] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[2] += 1. / 6. * amp_sv[0];
@@ -527,8 +615,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 10
       FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[9] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -541,8 +632,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[10] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -555,8 +649,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[11] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -955,9 +1052,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1033,8 +1129,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1096,25 +1191,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,   // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,    // if true, compute final multichannel weights
+    bool mulChannelWeight,       // if true, multiply matrix element by channel weight
 #endif
-    const fptype globaldenom ) /* clang-format on */
+    const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
-      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt];
-      totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      totAllDenominators[ievt] += hAllDenominators[ievt]; // keep the running sum of denominators over helicities
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
    }
+    if( mulChannelWeight )
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
+    }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
@@ -1159,16 +1264,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,          // output: color selection[nevt]
-              const fptype* allrndcol, // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )         // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+                       const fptype* allrndcol, // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators, // input: all numerators
+                       const fptype* allDenominators, // input: all denominators
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1236,6 +1369,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -1245,6 +1379,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1256,8 +1392,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1285,7 +1423,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1301,7 +1438,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1315,11 +1452,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1331,6 +1471,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     //     In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1339,9 +1480,10 @@ namespace mg5amcCpu
      const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1352,9 +1494,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
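
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; kernel and buffer names are hypothetical.) The
// one-stream-per-good-helicity pattern of PART 1a, reduced to its skeleton: independent
// per-helicity kernels run concurrently, each on its own stream, and a combining kernel runs
// after a synchronisation point, mirroring steps (1) and (3). The real code launches
// calculate_jamps via the gpuLaunchKernelStream macro instead.
// ------------------------------------------------------------------------------------------------
#include <cuda_runtime.h>
__global__ void computePerHelicity( float* perHelOut, int ihel, int nevt ) { /* per-helicity work */ }
__global__ void combineHelicities( const float* perHelIn, float* out, int nGoodHel, int nevt ) { /* sum over helicities */ }
void runHelicityStreams( float* perHel, float* out, cudaStream_t* streams, int nGoodHel, int nevt )
{
  for( int ighel = 0; ighel < nGoodHel; ++ighel ) // step (1): one stream per good helicity
    computePerHelicity<<<64, 256, 0, streams[ighel]>>>( perHel + ighel * nevt, ighel, nevt );
  for( int ighel = 0; ighel < nGoodHel; ++ighel ) // step (3): wait for all helicity streams
    cudaStreamSynchronize( streams[ighel] );
  combineHelicities<<<64, 256>>>( perHel, out, nGoodHel, nevt );
}
// ------------------------------------------------------------------------------------------------
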
@@ -1392,40 +1540,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1440,7 +1554,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1488,82 +1603,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
-        }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
-        {
-          const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
-          {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
-            {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
-              break;
-            }
-          }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
-          {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
-            {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
-              break;
-            }
-          }
-#endif
-        }
-      }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
+        }
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          const int ievt = ievt00 + ieppV;
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
+            {
+              channelIdVec[ieppV] = ichan + 1;
+              break;
+            }
+          }
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
+            {
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              break;
+            }
+          }
+        }
+      }
       else
@@ -1588,13 +1718,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1602,13 +1726,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
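
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; names are hypothetical.) The event-by-event
// channel choice added above is an inverse-CDF draw: each active diagram is selected with
// probability proportional to its single-diagram numerator. A scalar, self-contained model of
// the same loop (inactive channels, i.e. channel2iconfig[ichan] == -1, enter with weight 0):
// ------------------------------------------------------------------------------------------------
#include <cassert>
#include <vector>
// Returns a 1-based diagram id, as stored in channelIdVec[] / allDiagramIdsOut[] above
unsigned int sampleDiagram( const std::vector<double>& numerators, double rnd )
{
  double normalization = 0.;
  for( double n : numerators ) normalization += n;
  assert( normalization > 0. );
  double numerator_sum = 0.;
  for( unsigned int ichan = 0; ichan < numerators.size(); ++ichan )
  {
    numerator_sum += numerators[ichan];
    if( rnd < numerator_sum / normalization ) return ichan + 1;
  }
  return static_cast<unsigned int>( numerators.size() ); // fallback for rnd ~ 1 (cf. 'channelIdVec[ieppV] = mgOnGpu::nchannels' above)
}
// ------------------------------------------------------------------------------------------------
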
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h
index daa474c26a..649bf473b3 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h
@@ -164,6 +164,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol, // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
@@ -172,6 +173,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -188,6 +191,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol, // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+            const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
@@ -195,6 +199,8 @@ namespace mg5amcCpu
             int* allselcol, // output: helicity selection[nevt]
             fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
             fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
             const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/processConfig.h
new file mode 100644
index 0000000000..0861e7eec7
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GD_TTXWMU_H
+#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GD_TTXWMU_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 12;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GD_TTXWMU_H
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
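
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; the helper is hypothetical.) The new
// processConfig.h exposes the per-subprocess diagram count as a compile-time constant, so every
// per-diagram buffer can be sized consistently from one place, e.g.:
// ------------------------------------------------------------------------------------------------
#include "processConfig.h"
#include <cstddef>
#include <vector>
// One numerator slot per good helicity, per event, per diagram (zero-initialised), matching
// the cNGoodHel * processConfig::ndiagrams * nevt memsets in CPPProcess.cc
std::vector<double> makeNumeratorBuffer( int nGoodHel, int nevt )
{
  return std::vector<double>( static_cast<std::size_t>( nGoodHel ) * nevt * processConfig::ndiagrams, 0. );
}
// ------------------------------------------------------------------------------------------------
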
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc
index d6a510d40d..bb7db7b6ef 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,6 +100,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
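
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; names are hypothetical.) getChannelId relies on
// the invariant that all events in one SIMD page carry the same channelId, so the page can be
// represented by a single scalar. The same check, reduced to plain C++ over a raw array:
// ------------------------------------------------------------------------------------------------
#include <cassert>
constexpr int neppVEx = 4; // stand-in for neppV (events per SIMD page)
unsigned int pageChannelId( const unsigned int* allChannelIds, int ievt00 )
{
  const unsigned int channelId = allChannelIds[ievt00]; // element[0] of the page
  for( int i = 1; i < neppVEx; ++i )
    assert( channelId == allChannelIds[ievt00 + i] ); // cf. SANITY CHECK #898 above
  assert( channelId > 0 ); // channelId == 0 would mean "multichannel disabled"
  return channelId;
}
// ------------------------------------------------------------------------------------------------
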
@@ -240,7 +304,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                    fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   bool storeChannelWeights, // input: if true, store the per-diagram channel weights (numerators and denominators)
                    fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -249,7 +313,7 @@ namespace mg5amcCpu
 #else
                    cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+                   bool storeChannelWeights, // input: if true, store the per-diagram channel weights (numerators and denominators)
                    fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -345,7 +409,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -358,7 +423,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -367,12 +432,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
 
@@ -398,8 +459,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 1
     FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[0] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
     jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -412,8 +476,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 2
     FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[1] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
     jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -428,8 +495,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 3
     FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[2] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= 1. / 6. * amp_sv[0];
     jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -442,8 +512,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 4
     FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[3] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= 1. / 6. * amp_sv[0];
     jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -457,8 +530,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 5
     FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[4] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 2. * amp_sv[0];
     jamp_sv[2] -= 1. / 6. * amp_sv[0];
@@ -471,8 +547,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 6
     FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[5] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 2. * amp_sv[0];
     jamp_sv[2] -= 1. / 6. * amp_sv[0];
@@ -485,8 +564,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 7
     FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[6] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 2. * amp_sv[0];
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
@@ -499,8 +581,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 8
     FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[7] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 2. * amp_sv[0];
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
@@ -513,8 +598,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 9
     FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[8] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
     jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -527,8 +615,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 10
     FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[9] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -541,8 +632,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 11
     FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[10] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -555,8 +649,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 12
     FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[11] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 2. * amp_sv[0];
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
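
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; names are hypothetical.) The per-diagram change
// above replaces 'if( channelId == N ) numerators_sv += ...' with an unconditional store into
// slot N-1, so a single pass fills the weights of every channel at once. A scalar model of the
// same pattern (std::norm plays the role of cxabs2):
// ------------------------------------------------------------------------------------------------
#include <complex>
void accumulateDiagram( int idiag, // 0-based diagram index
                        const std::complex<double>& amp, // amplitude of this diagram for this event
                        bool storeChannelWeights,
                        double* numerators, // [ndiagrams] slots for this event, summed over helicities
                        double& denominator ) // one slot per event, shared by all diagrams
{
  if( storeChannelWeights )
  {
    const double a2 = std::norm( amp ); // |amp|^2
    numerators[idiag] += a2;
    denominator += a2;
  }
}
// ------------------------------------------------------------------------------------------------
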
@@ -955,9 +1052,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1033,8 +1129,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1096,25 +1191,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights, // if true, compute final multichannel weights
+                    bool mulChannelWeight, // if true, multiply matrix element by channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
 {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
       }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
+      }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
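
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; names are hypothetical.) After normalise_output
// has summed the per-diagram numerators of all good helicities into the "helicity #0" slots,
// the multichannel reweighting is just a ratio taken at the chosen channel:
// ------------------------------------------------------------------------------------------------
// ME' = ME * N[channelId-1] / D, i.e. the single-diagram-enhancement weight of this event
double reweightME( double me, // |M|^2 already divided by globaldenom
                   const double* numerators, // [ndiagrams] for this event, summed over helicities
                   double denominator, // for this event, summed over helicities and diagrams
                   unsigned int channelId ) // 1-based diagram id, e.g. from allChannelIds[ievt]
{
  return me * numerators[channelId - 1] / denominator;
}
// ------------------------------------------------------------------------------------------------
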
@@ -1159,16 +1264,44 @@ namespace mg5amcCpu
   __global__ void
-  select_col( int* allselcol, // output: color selection[nevt]
-              const fptype* allrndcol, // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+                       const fptype* allrndcol, // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators, // input: all numerators
+                       const fptype* allDenominators, // input: all denominators
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1236,6 +1369,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol, // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
@@ -1245,6 +1379,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1256,8 +1392,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol, // output: helicity selection[nevt]
-            fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities
+            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
             const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
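
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; names are hypothetical.) The color choice in
// select_col_and_diag is another inverse-CDF draw, but over the squared color amplitudes and
// restricted to the colors allowed for the channel's SDE config (the icolamp mask). Reduced to
// a scalar helper, with 'allowed' standing for mgOnGpu::icolamp[iconfig - 1]:
// ------------------------------------------------------------------------------------------------
#include <cassert>
int selectColor( const bool* allowed, // [ncolor] mask of colors allowed for this config
                 const double* jamp2, // [ncolor] squared partial color amplitudes
                 int ncolor,
                 double rndcol ) // uniform random number in [0,1)
{
  double total = 0.;
  for( int icolC = 0; icolC < ncolor; ++icolC )
    if( allowed[icolC] ) total += jamp2[icolC];
  assert( total > 0. );
  double cumulative = 0.;
  for( int icolC = 0; icolC < ncolor; ++icolC )
  {
    if( allowed[icolC] ) cumulative += jamp2[icolC];
    if( rndcol < cumulative / total ) return icolC + 1; // 1-based, as in the Fortran convention
  }
  return ncolor; // numerical safety net for rndcol ~ 1
}
// ------------------------------------------------------------------------------------------------
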
@@ -1285,7 +1423,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1301,7 +1438,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1315,11 +1452,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1331,6 +1471,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1339,9 +1480,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1352,9 +1494,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1392,40 +1540,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1440,7 +1554,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1488,82 +1603,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1588,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1602,13 +1726,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h index d4d5408ad2..5330725977 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/processConfig.h new file mode 100644 index 0000000000..2e039b079e --- /dev/null +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GDX_TTXWPUX_H
+#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GDX_TTXWPUX_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 12;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GDX_TTXWPUX_H
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc
index 75dc9427fe..d69812222a 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,6 +100,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId!
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -428,8 +495,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -442,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -457,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -471,8 +547,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -485,8 +564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -499,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. 
* amp_sv[0]; @@ -513,8 +598,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -527,8 +615,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +632,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -955,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1033,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
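During helicity filtering the multichannel weights are not needed, so the replacement call just below passes storeChannelWeights = false. In the main sigmaKin path, by contrast, calculate_jamps now accumulates each diagram's |amp|^2 into its own slot of a per-event numerator record (numerators_sv[idiag], with buffers sized nevt * processConfig::ndiagrams), as in the amplitude blocks above. The scalar indexing implied by that SIMD layout is easy to get wrong, so here is a standalone C++ sketch of it; the neppV value and the double-precision fptype are build-dependent assumptions of the sketch, not part of the patch:

#include <cassert>
#include <cstddef>
#include <vector>

constexpr int neppV = 4;      // events per SIMD page (illustrative value; fixed by the build)
constexpr int ndiagrams = 12; // as in processConfig.h for these P1 subprocesses

// Scalar index of the numerator of diagram idiag (0-based) for event ievt:
// each SIMD page of neppV events owns ndiagrams contiguous vectors of neppV lanes,
// matching allNumerators[ievt / neppV * neppV * ndiagrams + ichan * neppV + ieppV % neppV] in the patch
std::size_t numIndex( int ievt, int idiag )
{
  assert( idiag >= 0 && idiag < ndiagrams );
  const std::size_t pageBase = static_cast<std::size_t>( ievt / neppV ) * neppV * ndiagrams;
  return pageBase + static_cast<std::size_t>( idiag ) * neppV + ievt % neppV;
}

int main()
{
  const int nevt = 16;
  std::vector<double> allNumerators( static_cast<std::size_t>( nevt ) * ndiagrams, 0. );
  allNumerators[numIndex( 5, 3 )] += 1.; // |amp|^2 of diagram 4 for event 5 accumulates here
  return 0;
}

On the CUDA side the record is simply per thread, i.e. allNumerators[ievt * processConfig::ndiagrams + idiag], as in the kernel code above.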
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1096,25 +1191,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply the matrix element by the channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt];
       totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
+    }
+    if( mulChannelWeight )
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
     }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
@@ -1159,16 +1264,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1236,6 +1369,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1245,6 +1379,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1256,8 +1392,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1285,7 +1423,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = 
HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1301,7 +1438,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1315,11 +1452,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1331,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1339,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1352,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, 
gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1392,40 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1440,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1488,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
            {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1588,13 +1718,7 @@ namespace mg5amcCpu
       // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
       // [NB 'sum over final spins, average over initial spins', eg see
      // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
      const int ievt0 = ipagV * neppV;
@@ -1602,13 +1726,14 @@
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators,
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h index 6aaf443f35..362cd39944 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/processConfig.h new file mode 100644 index 0000000000..8aa1915d04 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/processConfig.h 
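The sixteen-line processConfig.h that follows is the same boilerplate generated for each P1 subprocess directory: it exposes the diagram count as a compile-time constant, and every per-diagram buffer in this patch is sized with it. A minimal sketch of the intended consumption pattern (std::vector stands in for the real host/device buffer classes, which are an assumption of the sketch, not shown in the patch):

#include <cstddef>
#include <vector>

namespace processConfig
{
  constexpr int ndiagrams = 12; // one such constant per subprocess, from its processConfig.h
}

// Numerator buffers now hold one running sum per diagram per event,
// so a buffer that used to be sized nevt becomes nevt * ndiagrams:
std::vector<double> makeNumeratorBuffer( int nevt )
{
  return std::vector<double>( static_cast<std::size_t>( nevt ) * processConfig::ndiagrams, 0. );
}

Keeping the constant in a tiny standalone header lets files such as umami.cc share it across subprocesses without pulling in the full CPPProcess.h.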
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GU_TTXWPD_H
+#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GU_TTXWPD_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 12;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GU_TTXWPD_H
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc
index 27c3656e02..c9b72712e9 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,6 +100,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId!
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -428,8 +495,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -442,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -457,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -471,8 +547,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -485,8 +564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -499,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. 
* amp_sv[0]; @@ -513,8 +598,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -527,8 +615,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +632,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -955,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1033,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
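Here too, the helicity-filtering call just below passes storeChannelWeights = false, since no channel weights are needed while filtering. Once the per-diagram numerators have been summed over helicities, the event-by-event diagram choice (in select_col_and_diag on the GPU and in the sigmaKin page loop on the CPU) is an inverse-CDF draw over those numerators, skipping diagrams without an SDE config. A standalone sketch of that draw, with a hypothetical channel2iconfig mapping standing in for mgOnGpu::channel2iconfig:

#include <cassert>
#include <cstdio>

constexpr int nchannels = 12; // equals the diagram count for these P1 subprocesses

// Draw a 1-based channelId with probability numerators[ichan] / (sum over valid channels),
// mirroring the sampling loops added in this patch
unsigned int sampleDiagram( const double* numerators, const int* channel2iconfig, double rnd )
{
  double normalization = 0.;
  for( int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  assert( normalization > 0. );
  unsigned int channelId = nchannels; // fallback to the last channel, as in the code above
  double cumulative = 0.;
  for( int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue; // no SDE config: never sampled
    cumulative += numerators[ichan];
    if( rnd < cumulative / normalization )
    {
      channelId = ichan + 1; // channelId uses Fortran-style 1-based numbering
      break;
    }
  }
  return channelId;
}

int main()
{
  const int channel2iconfig[nchannels] = { 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, -1 }; // hypothetical mapping
  double numerators[nchannels];
  for( int i = 0; i < nchannels; i++ ) numerators[i] = 1. + i; // dummy per-diagram |amp|^2 sums
  std::printf( "sampled channelId = %u\n", sampleDiagram( numerators, channel2iconfig, 0.3 ) );
  return 0;
}

The sampled id is written to allDiagramIdsOut[ievt] and then drives the same per-channel color choice that a caller-supplied channelId would.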
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1096,25 +1191,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // compute the totals over all good helicities { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1159,16 +1264,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice
(nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators[ndiagrams*nevt], summed over helicities + const fptype* allDenominators, // input: denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; // fallback if no channel is selected below + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1236,6 +1369,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1245,6 +1379,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1256,8 +1392,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[ndiagrams*nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1285,7 +1423,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS =
HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1301,7 +1438,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1315,11 +1452,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1331,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1339,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1352,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, 
gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1392,40 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1440,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1488,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1588,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1602,13 +1726,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h index a4f9928134..dd4e039f85 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/processConfig.h new file mode 100644 index 0000000000..6724b700f9 --- /dev/null +++
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GUX_TTXWMDX_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GUX_TTXWMDX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 12; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GUX_TTXWMDX_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc index 31f01d963a..7ad6ebf5cb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -428,8 +495,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -442,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -457,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -471,8 +547,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -485,8 +564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -499,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
* amp_sv[0]; @@ -513,8 +598,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -527,8 +615,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +632,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -955,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1033,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
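The per-diagram numerators stored via the storeChannelWeights flag feed the new event-by-event choice of channel: both select_col_and_diag on the GPU and the allrnddiagram loop in the C++ path walk the cumulative distribution defined by the numerators, skipping channels whose channel2iconfig entry is -1. A minimal host-side sketch of that inverse-CDF walk, assuming double precision and a hypothetical free function sampleChannel that is not part of the plugin:

#include <vector>

// Draw a 1-based channelId from the discrete distribution given by per-channel weights,
// mirroring the sampling loop in select_col_and_diag: channels without an SDE iconfig
// (channel2iconfig == -1) carry no weight and can never be selected.
unsigned int sampleChannel( const std::vector<double>& numerators, const std::vector<int>& channel2iconfig, double rnd )
{
  const unsigned int nchannels = static_cast<unsigned int>( numerators.size() );
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  double cumulative = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    cumulative += numerators[ichan];
    if( rnd < cumulative / normalization ) return ichan + 1; // 1-based, as in the patch
  }
  return nchannels; // fallback when rounding keeps the cumulative ratio below rnd
}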
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1096,25 +1191,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // compute the totals over all good helicities { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1159,16 +1264,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice
(nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators[ndiagrams*nevt], summed over helicities + const fptype* allDenominators, // input: denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; // fallback if no channel is selected below + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1236,6 +1369,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1245,6 +1379,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1256,8 +1392,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[ndiagrams*nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1285,7 +1423,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS =
HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1301,7 +1438,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1315,11 +1452,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1331,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1339,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1352,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, 
gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1392,40 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1440,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1488,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1588,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1602,13 +1726,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators,
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h index 7d990b3bbd..2052fbd364 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the channel weight into the ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the channel weight into the ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/processConfig.h new file mode 100644 index 0000000000..59ab03987d --- /dev/null +++
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWPG_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWPG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 12; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWPG_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> // std::size_t + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype*
matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
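+ // NB: the type behind each void* input is fixed by its key (cf. the umami.h documentation); keys this process cannot consume are rejected with UMAMI_ERROR_UNSUPPORTED_INPUT rather than silently ignored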
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to twice the SIMD page size (mixed-precision builds process two neppV pages at a time) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + }
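+ // NB: when an optional input is not supplied, fixed defaults are used: 0.5 for the helicity/color/diagram random numbers and g_s = sqrt( 4 * M_PI * 0.118 ) = 1.2177157847767195, i.e. alpha_s = 0.118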
+ computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> /* size_t */ + +#ifdef __cplusplus extern "C" { #endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata of the matrix element code, such as the target device + * and the number of particles, diagrams and helicity combinations. + * + * @param meta_key + * key of the requested metadata entry + * @param result + * pointer to the result; its type depends on the meta key + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
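+ * + * Memory layout (as implemented in umami.cc): batched arrays use a structure-of-arrays layout with batch stride `stride`, i.e. slot q of event i lives at array[stride * q + offset + i]. UMAMI_IN_MOMENTA uses 4 * particle_count slots with q = particle_count * i_mom + i_part, UMAMI_OUT_DIAGRAM_AMP2 uses one slot per diagram (q = i_diag), and per-event scalars such as UMAMI_OUT_MATRIX_ELEMENT use a single slot (q = 0).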
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 0da34a0aa2..c814985da4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0046498775482177734  +DEBUG: model prefixing takes 0.0018219947814941406  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -167,7 +168,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.025 s +5 processes with 7 diagrams generated in 0.057 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -207,7 +208,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.114 s +13 processes with 76 diagrams generated in 0.136 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -373,21 +374,21 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.872 s +65 processes with 1119 diagrams generated in 0.886 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -498,9 +499,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 
INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 
82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1589]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -509,9 +510,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -520,9 +521,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -531,9 +532,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -542,9 +543,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -553,9 +554,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -564,9 +565,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -575,9 +576,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -586,9 +587,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -597,9 +598,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -608,9 +609,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -619,9 +620,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -630,9 +631,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -641,9 +642,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -652,9 +653,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -663,9 +664,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -674,9 +675,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
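The identity tables in the DEBUG lines above are the simplest case: every diagram maps one-to-one onto an SDE iconfig. Elsewhere in this patch the same mapping is consumed through mgOnGpu::channel2iconfig, where the input index uses C indexing (channelId - 1), the output uses Fortran indexing (iconfig), and a -1 entry marks a diagram with no associated config (cf. #877/#917 in the hunks below). A minimal standalone sketch of that lookup, with hypothetical values standing in for the generated table:

#include <cassert>

namespace sketch
{
  // Hypothetical stand-in for the generated mgOnGpu::channel2iconfig table:
  // for gg_ttx all 3 diagrams map 1:1 onto SDE configs (cf. the DEBUG lines above);
  // a -1 entry would mark a diagram with no associated SDE config.
  constexpr int nchannels = 3;
  constexpr int channel2iconfig[nchannels] = { 1, 2, 3 };
}

int main()
{
  const unsigned int channelId = 2;                           // diagram id, 1-based (Fortran-style)
  const int iconfig = sketch::channel2iconfig[channelId - 1]; // SDE config, 1-based; -1 if none
  assert( iconfig > 0 && iconfig <= sketch::nchannels );      // cf. SANITY CHECK #917 in the patch
  return 0;
}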
@@ -685,25 +686,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1577]  -Generated helas calls for 18 subprocesses (372 diagrams) in 1.392 s -Wrote files for 810 helas calls in 2.303 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1589]  +Generated helas calls for 18 subprocesses (372 diagrams) in 0.701 s +Wrote files for 810 helas calls in 44.199 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.191 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.237 s +ALOHA: aloha creates 10 routines in 0.178 s VVV1 VVV1 FFV1 @@ -716,32 +717,34 @@ ALOHA: aloha creates 10 routines in 0.237 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m10.952s -user 0m9.707s -sys 0m1.156s -Code generation completed in 11 seconds +real 0m57.768s +user 0m6.485s +sys 0m2.494s +Code generation completed in 58 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -762,9 +765,9 @@ Code generation completed in 11 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -791,9 +794,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS 
HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 2ffa35504b..02f5d7a8eb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL 
+ , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +452,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +469,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -732,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -810,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
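// ------------------------------------------------------------------------
// Illustrative standalone sketch (hypothetical names; not part of the
// generated code): the hunks above replace the single per-event numerator,
// selected by channelId, with one slot per diagram,
// numerators[ievt * ndiagrams + idiag], guarded by the new
// storeChannelWeights flag; the denominator keeps the running sum of
// |amp|^2 over all diagrams, as before.
#include <complex>
#include <vector>

static void accumulateChannelWeights( const std::vector<std::complex<double>>& amps, // one amplitude per diagram, one helicity
                                      std::vector<double>& numerators,               // size nevt * ndiagrams
                                      std::vector<double>& denominators,             // size nevt
                                      int ievt, int ndiagrams, bool storeChannelWeights )
{
  if( !storeChannelWeights ) return; // e.g. during helicity filtering, where channel weights are not needed
  for( int idiag = 0; idiag < ndiagrams; ++idiag )
  {
    const double a2 = std::norm( amps[idiag] );  // plays the role of cxabs2( amp_sv[0] ) in the generated code
    numerators[ievt * ndiagrams + idiag] += a2;  // was: numerators_sv += ... only if channelId == idiag + 1
    denominators[ievt] += a2;                    // was: denominators_sv += ... only if channelId != 0
  }
}
// ------------------------------------------------------------------------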
#endif /* clang-format on */ @@ -873,25 +941,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -936,16 +1014,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1013,6 +1119,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1022,6 +1129,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1033,8 +1142,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1062,7 +1173,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1078,7 +1188,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1092,11 +1202,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1108,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1116,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1129,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1169,40 +1290,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1217,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1265,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1365,13 +1468,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1379,13 +1476,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 3100207a11..04a1595fd1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 
2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index b1adf10a8d..791fdf32bd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators for this helicity fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators for this helicity fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2.
* amp_sv[0]; @@ -709,9 +773,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -787,8 +850,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -850,25 +912,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { +
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -913,16 +985,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -990,6 +1090,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -999,6 +1100,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1010,8 +1113,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1039,7 +1144,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1055,7 +1159,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1069,11 +1173,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1085,6 +1192,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1093,9
+1201,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1106,9 +1215,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1146,40 +1261,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
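Both the select_col_and_diag kernel launched above and the C++ loop further below sample a diagram with probability proportional to its accumulated numerator, via a single inverse-CDF draw (the real code additionally skips channels with mgOnGpu::channel2iconfig[ichan] == -1). A standalone sketch of this sampling step, with hypothetical names and not part of this patch:

#include <cassert>
#include <vector>
// Pick a 1-based diagram id with probability numerators[d] / sum(numerators),
// consuming one uniform random number rnd in [0,1).
unsigned int sampleDiagram( const std::vector<double>& numerators, double rnd )
{
  double normalization = 0.;
  for( double n : numerators ) normalization += n; // same role as 'normalization' in the kernels
  assert( normalization > 0. );
  double cumsum = 0.;
  for( unsigned int d = 0; d < numerators.size(); ++d )
  {
    cumsum += numerators[d];
    if( rnd < cumsum / normalization ) return d + 1; // first bin whose CDF exceeds rnd
  }
  return (unsigned int)numerators.size(); // rounding guard, like 'channelId = mgOnGpu::nchannels' above
}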
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1194,7 +1275,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1242,82 +1324,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1342,13 +1439,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1356,13 +1447,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index bb3daa0e4d..36998a8fa1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input:
random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/processConfig.h new file mode 100644 index 0000000000..771b635b93 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin.
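The new per-process processConfig.h headers (ndiagrams = 3 for P0_gg_ttx above, ndiagrams = 1 for P0_uux_ttx here) are what size the enlarged numerator buffers throughout this patch: numerators now carry one value per (helicity, event, diagram), while denominators keep one value per (helicity, event). A sketch of the implied indexing, with hypothetical helper names:

// Hypothetical index helpers (not part of this patch) describing the layout used above.
// ghelAllNumerators holds nGoodHel * nevt * ndiagrams values:
inline int numeratorIndex( int ighel, int ievt, int idiag, int nevt, int ndiagrams )
{
  // matches 'ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams' plus [idiag]
  return ( ievt + ighel * nevt ) * ndiagrams + idiag;
}
// ghelAllDenominators keeps the old nGoodHel * nevt layout:
inline int denominatorIndex( int ighel, int ievt, int nevt )
{
  // matches 'ghelAllDenominators + ighel * nevt' plus [ievt]
  return ievt + ighel * nevt;
}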
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTX_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 1; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 7f29af7755..b8f69df605 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators for this helicity fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators for this helicity fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; @@ -410,8 +474,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -424,8 +491,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -439,8 +509,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -452,8 +525,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -466,8 +542,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -480,8 +559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -493,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -507,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -520,8 +608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -534,8 +625,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -548,8 +642,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -564,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -577,8 +677,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -590,8 +693,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -949,9 +1055,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps,
gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1027,8 +1132,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1090,25 +1194,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1153,16 +1267,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color
selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1230,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1239,6 +1382,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt]
super-buffer for nGoodHel <= ncomb individual helicities @@ -1250,8 +1395,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1279,7 +1426,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1295,7 +1441,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1309,11 +1455,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1325,6 +1474,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1333,9 +1483,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, 
ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1346,9 +1497,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1386,40 +1543,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1434,7 +1557,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1482,82 +1606,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1582,13 +1721,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1596,13 +1729,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 1b49cac30b..1b956214b7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: 
random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/processConfig.h new file mode 100644 index 0000000000..47044dbe6a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
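This new per-process header (added here for P1_gg_ttxg with `ndiagrams = 16`, and further below for P1_gu_ttxu with `ndiagrams = 5`) is what lets the numerator buffers be sized and indexed per diagram. The offsets in the patch imply two distinct layouts: on the GPU each helicity block holds `nevt * ndiagrams` contiguous values with the diagram index running fastest, while the SIMD C++ path stores, within each page of `neppV` events, one `neppV`-wide slot per diagram. A minimal sketch of the two indexing conventions, with illustrative helper names that do not appear in the patch:

```cpp
// Index helpers mirroring the numerator-buffer layouts implied by the patch
// (helper names are illustrative, not part of the generated code).
#include <cassert>

// GPU layout: per-helicity blocks of nevt events, ndiagrams values per event,
// diagram index fastest: ghelAllNumerators[( ievt + ighel * nevt ) * ndiagrams + idiag]
inline int gpuNumIdx( int ighel, int ievt, int idiag, int nevt, int ndiagrams )
{
  return ( ievt + ighel * nevt ) * ndiagrams + idiag;
}

// SIMD C++ layout: pages of neppV events, one neppV-wide slot per diagram:
// allNumerators[ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV]
inline int cpuNumIdx( int ievt, int idiag, int neppV, int ndiagrams )
{
  return ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV;
}

int main()
{
  const int nevt = 16, ndiagrams = 16, neppV = 4;
  // distinct events never share a slot for the same diagram and helicity
  assert( gpuNumIdx( 0, 3, 5, nevt, ndiagrams ) != gpuNumIdx( 0, 4, 5, nevt, ndiagrams ) );
  // within one SIMD page, consecutive events of the same diagram are adjacent
  assert( cpuNumIdx( 1, 2, neppV, ndiagrams ) == cpuNumIdx( 0, 2, neppV, ndiagrams ) + 1 );
  return 0;
}
```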
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 16; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index a15b72b642..7c3b3f4b4a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -426,8 +493,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -440,8 +510,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -454,8 +527,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -787,9 +863,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -865,8 +940,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
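As the per-diagram hunks above show (diagrams 1 to 5 of P1_gu_ttxu), the scalar `channelId` test in each generated amplitude block is replaced by a single `storeChannelWeights` flag: instead of accumulating one numerator for the preselected channel only, every diagram's `|amp|^2` now goes into its own slot so that a channel can be sampled afterwards, which is also why the helicity-filtering calls immediately above and below now pass `false` rather than `channelId = 0`. A condensed sketch of the old versus new accumulation rule; the generated code inlines this once per diagram, and `Amp` plus the container arguments are illustrative stand-ins:

```cpp
#include <complex>
#include <vector>

using Amp = std::complex<double>;

// Old rule: one numerator per event, filled only for the preselected channel.
void accumulateOld( unsigned int channelId, const std::vector<Amp>& amps,
                    double& numerator, double& denominator )
{
  for( unsigned int d = 0; d < amps.size(); ++d )
  {
    const double a2 = std::norm( amps[d] );   // |amp|^2, as cxabs2 computes
    if( channelId == d + 1 ) numerator += a2; // only the requested diagram
    if( channelId != 0 ) denominator += a2;   // all diagrams, when SDE is enabled
  }
}

// New rule: one numerator per diagram, so a channel can be sampled afterwards.
void accumulateNew( bool storeChannelWeights, const std::vector<Amp>& amps,
                    std::vector<double>& numerators, double& denominator )
{
  if( !storeChannelWeights ) return;
  for( unsigned int d = 0; d < amps.size(); ++d )
  {
    const double a2 = std::norm( amps[d] );
    numerators[d] += a2; // keep every diagram's weight
    denominator += a2;
  }
}
```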
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -928,25 +1002,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -991,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice 
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1068,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1077,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1088,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1117,7 +1234,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = 
HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1133,7 +1249,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1147,11 +1263,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1163,6 +1282,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1171,9 +1291,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1184,9 +1305,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, 
gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1224,40 +1351,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1272,7 +1365,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1320,82 +1414,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1420,13 +1529,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1434,13 +1537,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bd42537623..0bf2e4625f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/processConfig.h new file mode 100644 index 0000000000..fe66e4e760 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/processConfig.h @@ -0,0 +1,16 @@ +// 
Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXU_H +#define MG5_CONFIG_SIGMA_SM_GU_TTXU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GU_TTXU_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 2cac6d6802..36ef0f1276 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -426,8 +493,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -440,8 +510,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -454,8 +527,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -787,9 +863,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -865,8 +940,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
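
The hunks above replace the old per-channel bookkeeping (`if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] )`) with an unconditional per-diagram store (`numerators_sv[N - 1] += cxabs2( amp_sv[0] )`) behind a single `storeChannelWeights` flag, with the numerator buffer enlarged to one slot per diagram per event. A minimal scalar sketch of this bookkeeping, ignoring the SIMD event pages and the GPU per-helicity super-buffers (the `ChannelWeights` type and its member names are illustrative, not part of the plugin):

```cpp
// Sketch only: scalar, single-event equivalent of the per-diagram
// numerator/denominator accumulation added in calculate_jamps above.
#include <cassert>
#include <complex>
#include <vector>

using fptype = double;
constexpr int ndiagrams = 5; // cf. processConfig::ndiagrams for these P1 subprocesses

struct ChannelWeights // illustrative name, not a plugin type
{
  std::vector<fptype> numerators;   // [nevt * ndiagrams]: one slot per diagram per event
  std::vector<fptype> denominators; // [nevt]: running sum over all diagrams
  explicit ChannelWeights( int nevt )
    : numerators( nevt * ndiagrams, 0 ), denominators( nevt, 0 ) {}

  // One call per diagram per good helicity, cf. 'numerators_sv[idiag] += cxabs2( amp_sv[0] )'
  void addAmp2( int ievt, int idiag, const std::complex<fptype>& amp )
  {
    const fptype amp2 = std::norm( amp ); // |amp|^2, cf. cxabs2
    numerators[ievt * ndiagrams + idiag] += amp2;
    denominators[ievt] += amp2;
  }

  // SDE weight for a 1-based channelId, cf. 'MEs_sv *= numerators_sv[channelId - 1] / denominators_sv'
  fptype weight( int ievt, unsigned int channelId ) const
  {
    assert( channelId >= 1 && channelId <= (unsigned int)ndiagrams );
    return numerators[ievt * ndiagrams + ( channelId - 1 )] / denominators[ievt];
  }
};
```

Keeping every diagram's |amp|^2 means the single-diagram-enhancement weight of any channel can still be formed after the helicity sum, which is what enables both the a-posteriori diagram sampling via `allrnddiagram` and the `mulChannelWeight` switch in `normalise_output`.
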
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -928,25 +1002,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -991,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice 
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1068,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1077,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1088,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1117,7 +1234,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = 
HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1133,7 +1249,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1147,11 +1263,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1163,6 +1282,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1171,9 +1291,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1184,9 +1305,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, 
gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1224,40 +1351,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1272,7 +1365,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1320,82 +1414,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1420,13 +1529,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1434,13 +1537,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index dd4aae8a06..9191598e88 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/processConfig.h new file mode 100644 index 0000000000..89823b9d1d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/processConfig.h @@ -0,0 +1,16 
@@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H +#define MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 75c172df70..ab7500dca5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -426,8 +493,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -440,8 +510,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -454,8 +527,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[4], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -787,9 +863,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -865,8 +940,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
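
As in the previous subprocess, `select_col_and_diag` (CUDA/HIP) and the C++ `sigmaKin` path draw an event-by-event channelId from the per-diagram numerators accumulated above, skipping diagrams whose `channel2iconfig` entry is -1 (see #917). A standalone sketch of that inverse-CDF selection for a single event, with plain scalar indexing instead of the `ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV` SIMD layout (the function name `sampleChannelId` is illustrative, not part of the plugin):

```cpp
// Sketch only: single-event, scalar version of the new event-by-event
// channel choice (inverse CDF over the per-diagram numerators).
#include <vector>

using fptype = double;

unsigned int sampleChannelId( const fptype* numerators,                // [ndiagrams] for one event
                              const std::vector<int>& channel2iconfig, // -1 marks "no SDE iconfig" (#917)
                              fptype rnd )                             // uniform random number in [0,1)
{
  const unsigned int nchannels = (unsigned int)channel2iconfig.size();
  fptype normalization = 0;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  fptype cumulative = 0;
  unsigned int channelId = nchannels; // fallback, cf. 'channelId = mgOnGpu::nchannels'
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue; // skip diagrams excluded from sampling
    cumulative += numerators[ichan];
    if( rnd < cumulative / normalization )
    {
      channelId = ichan + 1; // 1-based channelId, as written to allDiagramIdsOut
      break;
    }
  }
  return channelId;
}
```

The `nchannels` fallback mirrors `channelId = mgOnGpu::nchannels` in the kernels, guarding against floating-point rounding when the random number is very close to 1.
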
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -928,25 +1002,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -991,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice 
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1068,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1077,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1088,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1117,7 +1234,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = 
@@ -1068,6 +1180,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -1077,6 +1190,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1088,8 +1203,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol,          // output: color selection[nevt]
-    fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1117,7 +1234,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1133,7 +1249,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1147,11 +1263,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1163,6 +1282,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1171,9 +1291,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
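For reference, the enlarged super-buffer layouts implied by the gpuMemset sizes and pointer arithmetic above can be captured in two small index helpers (illustrative only; these helpers do not exist in the codebase):

    // ghelAllNumerators  : nGoodHel * nevt * ndiagrams entries
    // ghelAllDenominators: nGoodHel * nevt entries
    inline int numIndex( int ighel, int ievt, int idiag, int nevt, int ndiag )
    {
      return ( ievt + ighel * nevt ) * ndiag + idiag; // diagrams contiguous per event
    }
    inline int denIndex( int ighel, int ievt, int nevt )
    {
      return ievt + ighel * nevt; // one entry per (helicity, event)
    }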
@@ -1184,9 +1305,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
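Part 1a thus follows a simple fan-out/join pattern: one calculate_jamps launch per good helicity, each on its own stream, with the cross-helicity kernels (add_and_select_hel, normalise_output, select_col_and_diag) acting as the join. A self-contained CUDA sketch of the pattern, with a hypothetical kernel standing in for calculate_jamps (illustrative; the real code goes through the gpuLaunchKernelStream abstraction):

    #include <cuda_runtime.h>

    // hypothetical per-helicity kernel: fills this helicity's slice of a buffer
    __global__ void perHelicityKernel( float* slice )
    {
      slice[blockIdx.x * blockDim.x + threadIdx.x] = 1.f;
    }

    void fanOutJoin( float* devBuf, cudaStream_t* streams, int nGoodHel, int blocks, int threads )
    {
      const int nevt = blocks * threads;
      for( int ighel = 0; ighel < nGoodHel; ighel++ ) // fan out: one stream per good helicity
        perHelicityKernel<<<blocks, threads, 0, streams[ighel]>>>( devBuf + ighel * nevt );
      for( int ighel = 0; ighel < nGoodHel; ighel++ ) // join before any cross-helicity reduction
        cudaStreamSynchronize( streams[ighel] );
    }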
@@ -1224,40 +1351,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1272,7 +1365,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1320,82 +1414,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
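On the C++ side the same inverse-CDF draw runs per SIMD lane; the only subtlety is the index arithmetic, since numerators are stored page-major, then diagram-major, then lane-major. A hypothetical helper (not in the patch) spelling out the expression used above:

    inline int simdNumIndex( int ievt, int idiag, int neppV, int ndiag )
    {
      const int ipag = ievt / neppV;  // SIMD page containing the event
      const int ilane = ievt % neppV; // lane of the event within its page
      return ipag * neppV * ndiag + idiag * neppV + ilane;
    }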
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
           //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1420,13 +1529,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
      const int ievt0 = ipagV * neppV;
@@ -1434,13 +1537,14 @@ namespace mg5amcCpu
      fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
      MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
      {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h
index 46d25105cc..514325b407 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
     int* allselcol,          // output: color selection[nevt]
     fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
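The per-process constant that makes the fixed-size per-diagram buffers possible is generated into the tiny new header shown next. A sketch of how dependent code can consume it (illustrative usage, not taken from the patch):

    #include "processConfig.h"
    static_assert( processConfig::ndiagrams > 0, "expect at least one diagram" );
    // fixed-size per-diagram scratch: one slot per Feynman diagram of the process
    double numerators[processConfig::ndiagrams] = {};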
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/processConfig.h
new file mode 100644
index 0000000000..8754e13596
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXG_H
+#define MG5_CONFIG_SIGMA_SM_UUX_TTXG_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 5;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXG_H
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
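Most of the CPPProcess.cc diff that follows is one mechanical rewrite repeated for every diagram: the runtime channelId test becomes an unconditional per-diagram accumulation, so the weights of all channels survive for a posteriori diagram sampling. Schematically, for each diagram number N (N is a placeholder here):

    // before: only the preselected channel kept its numerator
    //   if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] );
    //   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
    // after: every diagram keeps its own numerator, at index N-1
    //   if( storeChannelWeights )
    //   {
    //     numerators_sv[N - 1] += cxabs2( amp_sv[0] );
    //     denominators_sv += cxabs2( amp_sv[0] );
    //   }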
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc
index 382d6f340c..c5593feede 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // C++ (not CUDA)
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
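Call sites for the new getChannelId helper differ between the two builds: the GPU overload takes only the channel-id buffer (the event is implicit in the thread index via the device accessor), while the C++ overload receives the SIMD page start and may skip the mixed-precision cross-check. A sketch of both call shapes, mirroring the uses elsewhere in this patch:

    #ifdef MGONGPUCPP_GPUIMPL
      const unsigned int channelId = getChannelId( allChannelIds ); // event index implicit in the thread id
    #else
      const unsigned int channelId = getChannelId( allChannelIds, ievt00, false ); // SIMD page start; skip the #924 cross-check
    #endif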
+
   constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -239,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights, // input: compute the per-diagram multichannel numerators and denominators?
     fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s,    // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -248,7 +312,7 @@ namespace mg5amcCpu
 #else
     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights, // input: compute the per-diagram multichannel numerators and denominators?
     fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv,     // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -344,7 +408,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -357,7 +422,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
      COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -366,12 +431,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -430,8 +491,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 2
     VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[1] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -450,8 +514,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 3
     VVV1_0( w_fp[7], w_fp[4], w_fp[9],
COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -470,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -491,8 +561,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -505,8 +578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -521,8 +597,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -535,8 +614,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -549,8 +631,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -565,8 +650,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -579,8 +667,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -593,8 +684,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -609,8 +703,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -623,8 +720,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -639,8 +739,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -655,8 +758,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; @@ -673,8 +779,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -686,8 +795,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -699,8 +811,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -714,8 +829,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -730,8 +848,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -744,8 +865,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -758,8 +882,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -774,8 +901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( 
storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -788,8 +918,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -802,8 +935,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -815,8 +951,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -828,8 +967,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -841,8 +983,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -854,8 +999,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -868,8 +1016,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= 
amp_sv[0]; @@ -910,8 +1061,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] -= amp_sv[0]; @@ -923,8 +1077,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[33] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= amp_sv[0]; @@ -936,8 +1093,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -950,8 +1110,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -966,8 +1129,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 37 FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[36] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -980,8 +1146,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 38 FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[37] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -994,8 +1163,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 39 VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[38] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -1010,8 +1182,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 40 FFV1_0( w_fp[20], 
w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[39] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1024,8 +1199,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 41 FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[40] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1038,8 +1216,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 42 FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[41] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] -= amp_sv[0]; @@ -1051,8 +1232,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 43 FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[42] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] -= amp_sv[0]; @@ -1064,8 +1248,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 44 FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[43] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[23] -= amp_sv[0]; @@ -1077,8 +1264,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 45 FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[44] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] -= amp_sv[0]; @@ -1090,8 +1280,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 46 FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[45] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1104,8 +1297,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 47 VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] ); - 
if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[46] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -1143,8 +1339,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 49 FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[48] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1157,8 +1356,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 50 FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[49] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -1173,8 +1375,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 51 FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[50] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1187,8 +1392,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 52 FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[51] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1201,8 +1409,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 53 FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[52] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -1217,8 +1428,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 54 FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[53] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1231,8 +1445,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 55 FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 
0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[54] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
@@ -1247,8 +1464,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 56
       FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[55] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += amp_sv[0];
       jamp_sv[11] -= amp_sv[0];
@@ -1263,8 +1483,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 57
       VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[56] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1317,8 +1540,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 59
       VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[58] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1337,8 +1563,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 60
       VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[59] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1357,8 +1586,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 61
       FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[60] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -1373,8 +1605,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 62
       FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[61] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1387,8 +1622,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 63
       FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[62] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -1403,8 +1641,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 64
       FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[63] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1418,8 +1659,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 65
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[64] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1432,8 +1676,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 66
       FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[65] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += amp_sv[0];
       jamp_sv[8] -= amp_sv[0];
@@ -1448,8 +1695,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 67
       FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[66] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1462,8 +1712,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 68
       FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[67] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1476,8 +1729,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 69
       FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[68] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -1492,8 +1748,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 70
       FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[69] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1506,8 +1765,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 71
       FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[70] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
@@ -1522,8 +1784,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 72
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[71] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += amp_sv[0];
       jamp_sv[9] -= amp_sv[0];
@@ -1538,8 +1803,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 73
       VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[72] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1592,8 +1860,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 75
       VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[74] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1612,8 +1883,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 76
       VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[75] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1632,8 +1906,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 77
       FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[76] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -1648,8 +1925,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 78
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[77] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1662,8 +1942,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 79
       FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[78] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -1678,8 +1961,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 80
       FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[79] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1692,8 +1978,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 81
       FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[80] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= amp_sv[0];
@@ -1705,8 +1994,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 82
       FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[81] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= amp_sv[0];
@@ -1718,8 +2010,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 83
       FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[82] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] -= amp_sv[0];
@@ -1731,8 +2026,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 84
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[83] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= amp_sv[0];
@@ -1744,8 +2042,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 85
       FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[84] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1758,8 +2059,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 86
       FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[85] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -1774,8 +2078,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 87
       FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[86] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= amp_sv[0];
@@ -1787,8 +2094,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 88
       FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[87] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= amp_sv[0];
@@ -1800,8 +2110,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 89
       FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[88] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] -= amp_sv[0];
@@ -1813,8 +2126,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 90
       FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[89] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= amp_sv[0];
@@ -1826,8 +2142,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 91
       FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[90] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1840,8 +2159,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 92
       FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[91] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
@@ -1890,8 +2212,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 94
       VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[93] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1910,8 +2235,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 95
       VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[94] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1930,8 +2258,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 96
       FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[95] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[19] -= amp_sv[0];
@@ -1946,8 +2277,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 97
       FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[96] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1960,8 +2294,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 98
       FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[97] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
@@ -1976,8 +2313,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 99
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[98] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2024,8 +2364,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 101
       VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[100] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2044,8 +2387,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 102
       VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[101] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2064,8 +2410,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 103
       FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[102] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -2080,8 +2429,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 104
       FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[103] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2094,8 +2446,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 105
       FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[104] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[4] -= amp_sv[0];
@@ -2110,8 +2465,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 106
       FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[105] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2158,8 +2516,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 108
       VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[107] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2178,8 +2539,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 109
       VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[108] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2198,8 +2562,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 110
       FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[109] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= amp_sv[0];
@@ -2211,8 +2578,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 111
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[110] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= amp_sv[0];
@@ -2224,8 +2594,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 112
       FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[111] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= amp_sv[0];
@@ -2237,8 +2610,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 113
       FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[112] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] -= amp_sv[0];
@@ -2877,9 +3253,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -2955,8 +3330,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
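
The mechanical change repeated in every diagram hunk above is the same: instead of accumulating a single per-event numerator only when the preselected channelId matches the diagram, the kernel now records |amp|^2 for every diagram in its own numerator slot, so the weight of any channel can be formed (or a channel sampled) afterwards. A minimal standalone sketch of the two accumulation schemes, NOT the generated kernel itself (scalar fptype, no SIMD; 'amp2' is a hypothetical stand-in for the per-diagram values of cxabs2( amp_sv[0] )):

    #include <vector>

    using fptype = double;

    // Old scheme: one running numerator per event, gated on a preselected channelId.
    fptype oldChannelWeight( const std::vector<fptype>& amp2, unsigned int channelId )
    {
      fptype numerator = 0, denominator = 0;
      for( unsigned int idiag = 0; idiag < amp2.size(); ++idiag )
      {
        if( channelId == idiag + 1 ) numerator += amp2[idiag]; // only the chosen diagram
        if( channelId != 0 ) denominator += amp2[idiag];       // all diagrams
      }
      return ( channelId != 0 ? numerator / denominator : 0 ); // 0 disables the enhancement
    }

    // New scheme: keep one numerator slot per diagram (numerators.size() == ndiagrams),
    // so any channel weight can be formed after the fact.
    void accumulateChannelWeights( const std::vector<fptype>& amp2, bool storeChannelWeights,
                                   std::vector<fptype>& numerators, fptype& denominator )
    {
      if( !storeChannelWeights ) return;
      for( unsigned int idiag = 0; idiag < amp2.size(); ++idiag )
      {
        numerators[idiag] += amp2[idiag];
        denominator += amp2[idiag];
      }
    }

The trade-off is memory: the numerator buffers grow by a factor ndiagrams, which is why the allocations and gpuMemset sizes change elsewhere in this patch.
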
@@ -3018,25 +3392,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,   // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights, // if true, compute final multichannel weights
+                    bool mulChannelWeight,    // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -3081,16 +3465,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -3158,6 +3570,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
@@ -3167,6 +3580,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -3178,8 +3593,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,          // output: helicity selection[nevt]
-            fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
             const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -3207,7 +3624,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -3223,7 +3639,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -3237,11 +3653,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
      MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -3253,6 +3672,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -3261,9 +3681,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -3274,9 +3695,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
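
The new select_col_and_diag kernel samples a diagram (channel) per event by inverse-transform sampling on the per-diagram numerators: diagrams whose channel2iconfig entry is -1 are skipped, the remaining |amp|^2 sums form a cumulative distribution, and the first cumulative fraction exceeding the random number wins. A standalone sketch of that logic, simplified to plain arrays and a single event (it assumes at least one selectable channel so that 'normalization' is nonzero; it is not the kernel's actual signature):

    // Returns a 1-based channelId sampled in proportion to the numerators.
    unsigned int sampleChannel( const double* numerators,   // per-diagram |amp|^2 sums
                                const int* channel2iconfig, // -1 = not a selectable channel
                                unsigned int nchannels,
                                double rnd )                // random number in [0,1)
    {
      double normalization = 0;
      for( unsigned int ichan = 0; ichan < nchannels; ++ichan )
        if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
      double cumulative = 0;
      for( unsigned int ichan = 0; ichan < nchannels; ++ichan )
      {
        if( channel2iconfig[ichan] == -1 ) continue;
        cumulative += numerators[ichan];
        if( rnd < cumulative / normalization ) return ichan + 1; // 1-based channelId
      }
      return nchannels; // fallback for rnd ~ 1 after rounding, as in the kernel above
    }
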
@@ -3314,40 +3741,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -3362,7 +3755,8 @@ namespace mg5amcCpu
        cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+       bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -3410,82 +3804,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
++#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -3510,13 +3919,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
      const int ievt0 = ipagV * neppV;
@@ -3524,13 +3927,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h
index 0c3370cd1c..c0d59a27ea 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
     int* allselcol,          // output: helicity selection[nevt]
     fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
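
The new processConfig.h below introduces the per-process ndiagrams constant that all the resized numerator buffers rely on. The indexing those sizes imply can be summarized in a small sketch; the layouts here are inferred from the offset arithmetic used in this patch, not from a documented API:

    constexpr int ndiagrams = 123; // processConfig::ndiagrams for gg -> ttxgg

    // GPU super-buffer: one slice per good helicity, then per event, then per diagram,
    // matching 'ghelAllNumerators + ( ievt + ighel * nevt ) * ndiagrams' above.
    inline int gpuNumeratorIndex( int ighel, int ievt, int idiag, int nevt )
    {
      return ( ievt + ighel * nevt ) * ndiagrams + idiag;
    }

    // C++/SIMD host buffer: events grouped in pages of neppV, diagram-major within a page,
    // matching 'ievt / neppV * neppV * ndiagrams + ichan * neppV + ievt % neppV' above.
    inline int simdNumeratorIndex( int ievt, int idiag, int neppV )
    {
      return ( ievt / neppV ) * neppV * ndiagrams + idiag * neppV + ievt % neppV;
    }
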
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/processConfig.h
new file mode 100644
index 0000000000..4f4a3c3bc0
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGG_H
+#define MG5_CONFIG_SIGMA_SM_GG_TTXGG_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 123;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGG_H
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc
index e5370edc0d..d133fb651d 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -101,6 +102,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
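
getChannelId above centralizes the SIMD-page uniformity checks that sigmaKin previously inlined (and that this patch deletes there). The invariant it protects can be stated without the plugin's accessor classes; a hypothetical host-side equivalent using plain array access:

    #include <cassert>

    // One channelId per SIMD page of neppV events; 0 disables the SDE enhancement.
    unsigned int pageChannelId( const unsigned int* allChannelIds, int ievt0, int neppV )
    {
      if( allChannelIds == nullptr ) return 0;
      const unsigned int channelId = allChannelIds[ievt0];
      for( int i = 1; i < neppV; ++i )
        assert( channelId == allChannelIds[ievt0 + i] ); // uniformity invariant (#898)
      assert( channelId > 0 );
      return channelId;
    }
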
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 6. 
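
The rewrite of diagram 1 just above sets the pattern repeated for every diagram in this file: instead of accumulating a single scalar numerator only when channelId matches the diagram number, each diagram now unconditionally adds its |amp|^2 into its own slot numerators_sv[idiag], so a single pass leaves the weights of all channels available for diagram sampling and reweighting; the denominator remains one running sum. A schematic model of the new bookkeeping, with plain doubles standing in for the fptype_sv SIMD types and std::norm() playing the role of cxabs2():

// Schematic model only, not the generated code.
#include <complex>

void addDiagramWeight( int idiag, std::complex<double> amp,
                       double* numerators, double& denominator )
{
  const double w = std::norm( amp ); // |amp|^2
  numerators[idiag] += w;            // one slot per diagram (idiag = diagram number - 1)
  denominator += w;                  // single running sum over all diagrams
}
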
* cxtype( 0, 1 ) * amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -481,8 +557,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * amp_sv[0]; jamp_sv[5] += 1. / 2. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * amp_sv[0]; jamp_sv[5] += 1. / 2. 
* amp_sv[0]; @@ -510,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[4] -= 1. / 6. * amp_sv[0]; @@ -524,8 +609,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * amp_sv[0]; jamp_sv[1] += 1. / 2. * amp_sv[0]; @@ -538,8 +626,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[1], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -553,8 +644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 2. * amp_sv[0]; jamp_sv[8] -= 1. / 6. * amp_sv[0]; @@ -567,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * amp_sv[0]; jamp_sv[10] += 1. / 2. * amp_sv[0]; @@ -581,8 +678,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * amp_sv[0]; jamp_sv[6] += 1. / 2. * amp_sv[0]; @@ -595,8 +695,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 6. * amp_sv[0]; jamp_sv[10] += 1. / 2. 
* amp_sv[0]; @@ -609,8 +712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -624,8 +730,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 2. * amp_sv[0]; jamp_sv[7] -= 1. / 6. * amp_sv[0]; @@ -638,8 +747,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * amp_sv[0]; jamp_sv[7] -= 1. / 6. * amp_sv[0]; @@ -652,8 +764,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 6. * amp_sv[0]; jamp_sv[6] += 1. / 2. * amp_sv[0]; @@ -666,8 +781,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 6. * amp_sv[0]; @@ -680,8 +798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -695,8 +816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= 1. / 6. 
* amp_sv[0]; jamp_sv[9] += 1. / 2. * amp_sv[0]; @@ -709,8 +833,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * amp_sv[0]; jamp_sv[7] -= 1. / 6. * amp_sv[0]; @@ -723,8 +850,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * amp_sv[0]; jamp_sv[11] -= 1. / 6. * amp_sv[0]; @@ -737,8 +867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += 1. / 2. * amp_sv[0]; jamp_sv[11] -= 1. / 6. * amp_sv[0]; @@ -751,8 +884,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -765,8 +901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 6. * amp_sv[0]; jamp_sv[9] += 1. / 2. * amp_sv[0]; @@ -779,8 +918,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +935,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * amp_sv[0]; jamp_sv[2] += 1. / 2. * amp_sv[0]; @@ -807,8 +952,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +969,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += 1. / 2. * amp_sv[0]; jamp_sv[11] -= 1. / 6. * amp_sv[0]; @@ -835,8 +986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -849,8 +1003,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 6. * amp_sv[0]; @@ -863,8 +1020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -899,8 +1059,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[1], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -915,8 +1078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -1284,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1362,8 +1527,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */
@@ -1425,25 +1589,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
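
With one numerator slot per diagram, the helicity reduction in normalise_output above changes shape accordingly: the per-helicity numerator buffers are folded diagram-by-diagram into the "helicity #0" super-buffer, the scalar denominators are summed as before, and only if mulChannelWeight is set is the ME rescaled by the selected channel's share. A simplified single-event sketch of that reduction, using flat hypothetical arrays in place of the ghelAllNumerators/ghelAllDenominators super-buffers:

// Single-event sketch (assumed flat layouts: numerators[ighel * ndiagrams + idiag],
// denominators[ighel]); channelId == 0 disables the reweighting.
void reduceAndReweight( double* numerators, double* denominators,
                        int nGoodHel, int ndiagrams,
                        unsigned int channelId, double& me )
{
  for( int ighel = 1; ighel < nGoodHel; ++ighel ) // fold into helicity #0
  {
    for( int idiag = 0; idiag < ndiagrams; ++idiag )
      numerators[idiag] += numerators[ighel * ndiagrams + idiag];
    denominators[0] += denominators[ighel];
  }
  if( channelId > 0 ) // apply the single-diagram-enhancement weight
    me *= numerators[channelId - 1] / denominators[0];
}
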
@@ -1488,16 +1662,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1565,6 +1767,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                  const fptype* allrndcol,           // input: random numbers[nevt] for color selection
                  const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+                 const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
                  fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                  int* allselhel,                    // output: helicity selection[nevt]
@@ -1574,6 +1777,8 @@
                  fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
                  fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                  fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+                 bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
                  fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                  fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1585,8 +1790,10 @@
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                  int* allselcol,                    // output: color selection[nevt]
-                 fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
-                 fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+                 fptype* allNumerators,             // output: multichannel numerators[nevt], running_sum_over_helicities
+                 fptype* allDenominators,           // output: multichannel denominators[nevt], running_sum_over_helicities
+                 unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+                 bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
                  const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1614,7 +1821,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds; //
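
The diagram choice added to select_col_and_diag above is a standard inverse-CDF draw: normalise this event's per-diagram numerators, skipping diagrams without an SDE configuration (channel2iconfig[ichan] == -1), then walk the cumulative sum until it exceeds the random number, falling back to nchannels if rounding prevents the threshold from ever being crossed. A scalar sketch of the same draw, with simplified array names in place of the kernel's buffers:

// Scalar sketch of the inverse-CDF diagram draw for one event; 'numerators'
// holds this event's per-diagram weights, 'rnd' is a uniform number in [0,1).
unsigned int sampleDiagram( const double* numerators, const int* channel2iconfig,
                            unsigned int nchannels, double rnd )
{
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ++ichan )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  double cumsum = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ++ichan )
  {
    if( channel2iconfig[ichan] == -1 ) continue; // no SDE config for this diagram
    cumsum += numerators[ichan];
    if( rnd < cumsum / normalization ) return ichan + 1; // 1-based channelId
  }
  return nchannels; // fallback, matching the kernel above
}
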
non-trivial access: buffer includes all events #endif #endif @@ -1630,7 +1836,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1644,11 +1850,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1660,6 +1869,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1668,9 +1878,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1681,9 +1892,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1721,40 +1938,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1769,7 +1952,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1817,82 +2001,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+        }
+        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+        if( iconfig <= 0 )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+          assert( iconfig > 0 ); // SANITY CHECK #917
+        }
+        else if( iconfig > (int)mgOnGpu::nconfigSDE )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+        }
+        fptype targetamp[ncolor] = { 0 };
+        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+        for( int icolC = 0; icolC < ncolor; icolC++ )
+        {
+          if( icolC == 0 )
+            targetamp[icolC] = 0;
+          else
+            targetamp[icolC] = targetamp[icolC - 1];
+          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+            jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+        }
+        const int ievt = ievt00 + ieppV;
+        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
         for( int icolC = 0; icolC < ncolor; icolC++ )
         {
-          if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+          if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
           {
-            allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
             break;
           }
         }
-#endif
       }
     }
     else
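
In the C++ SIMD branch above, the per-diagram numerators are laid out page-major (AOSOA): the ndiagrams slots of one neppV event page are stored together, with the neppV lanes of each diagram contiguous. The index expression ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV used in the sampling loop therefore decomposes into a page base, a diagram offset and a lane offset (ieppV % neppV equals ievt % neppV here, since ievt00 is a multiple of neppV). A tiny sketch making that decomposition explicit, under the assumption that the layout is exactly [ipage][idiag][ilane]:

// Index of diagram 'ichan' for event 'ievt' in the assumed AOSOA layout
// [ipage][idiag][ilane], with ipage = ievt / neppV and ilane = ievt % neppV.
inline int numeratorIndex( int ievt, int ichan, int neppV, int ndiagrams )
{
  const int ipage = ievt / neppV; // which SIMD event page
  const int ilane = ievt % neppV; // lane within the page
  return ( ipage * ndiagrams + ichan ) * neppV + ilane; // == ievt/neppV*neppV*ndiagrams + ichan*neppV + ilane
}
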
@@ -1917,13 +2116,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1931,13 +2124,14 @@
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h
index 471c526c49..abcc2d6233 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                  const fptype* allrndcol,           // input: random numbers[nevt] for color selection
                  const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+                 const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
                  fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                  int* allselhel,                    // output: helicity selection[nevt]
@@ -174,6 +175,8 @@
                  fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
                  fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                  fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+                 bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
                  fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                  fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                  const fptype* allrndcol,           // input: random numbers[nevt] for color selection
                  const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+                 const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
                  fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                  int* allselhel,                    // output: helicity selection[nevt]
@@ -197,6 +201,8 @@
                  int* allselcol,                    // output: color selection[nevt]
                  fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
                  fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+                 unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+                 bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
                  const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/processConfig.h
new file mode 100644
index 0000000000..e51eb2c6c2
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/processConfig.h
@@ -0,0 +1,16
@@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXUUX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXUUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 36; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXUUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 1fdfcee1ed..e2ac5942a4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[1], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[5] += 1. / 2. * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -481,8 +557,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[1], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. 
* amp_sv[0]; @@ -510,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[5], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -524,8 +609,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -538,8 +626,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -553,8 +644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -567,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[1], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[8] -= 1. / 2. * amp_sv[0]; @@ -581,8 +678,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[5], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[7] -= 1. / 2. * amp_sv[0]; @@ -595,8 +695,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. 
* amp_sv[0]; @@ -609,8 +712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -624,8 +730,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. * amp_sv[0]; jamp_sv[7] -= 1. / 2. * amp_sv[0]; @@ -638,8 +747,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -652,8 +764,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 6. * amp_sv[0]; jamp_sv[7] -= 1. / 2. * amp_sv[0]; @@ -666,8 +781,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[1], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 6. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -680,8 +798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -695,8 +816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. 
* amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -709,8 +833,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -723,8 +850,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[9] += 1. / 6. * amp_sv[0]; @@ -737,8 +867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -751,8 +884,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -765,8 +901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -779,8 +918,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +935,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 6. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -807,8 +952,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +969,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[1], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 2. * amp_sv[0]; jamp_sv[9] += 1. / 6. * amp_sv[0]; @@ -835,8 +986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[1], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -849,8 +1003,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[4] += 1. / 6. * amp_sv[0]; @@ -863,8 +1020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -899,8 +1059,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -915,8 +1078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -1284,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1362,8 +1527,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -1425,25 +1589,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: denominators must also be summed over helicities for the division below + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight && allChannelIds != nullptr ) // guard against a null allChannelIds as in the C++ path (fix segfault #892) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return;
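Note (illustrative sketch, not part of the patch): after this change ghelAllNumerators holds processConfig::ndiagrams numerators per event and per good helicity, flattened as ( ievt + ighel * nevt ) * ndiagrams + idiag, and normalise_output reduces all good helicities into the ighel=0 slice with one GPU thread per event. A minimal standalone C++ equivalent of that reduction, with hypothetical sizes and dummy data:

#include <cassert>
#include <vector>
int main()
{
  const int nGoodHel = 2, nevt = 4, ndiagrams = 3;                          // hypothetical sizes
  std::vector<double> ghelAllNumerators( nGoodHel * nevt * ndiagrams, 1. ); // dummy per-diagram numerators
  for( int ievt = 0; ievt < nevt; ++ievt )                                  // one GPU thread per event in the real kernel
    for( int ighel = 1; ighel < nGoodHel; ighel++ )                         // NB: the loop starts at ighel=1
      for( int idiag = 0; idiag < ndiagrams; ++idiag )
        ghelAllNumerators[ievt * ndiagrams + idiag] += ghelAllNumerators[( ievt + ighel * nevt ) * ndiagrams + idiag];
  assert( ghelAllNumerators[0] == nGoodHel ); // the ighel=0 slice now holds the sum over all good helicities
  return 0;
}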
@@ -1488,16 +1662,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: multichannel numerators[nevt*ndiagrams], summed over helicities + const fptype* allDenominators, // input: multichannel denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels )
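Note (illustrative sketch, not part of the patch): the diagram choice in select_col_and_diag above is an inverse-CDF draw over the per-diagram numerators, skipping channels whose channel2iconfig entry is -1 and falling back to the last channel if rounding pushes the random number past the accumulated sum. The same sampling logic in a minimal standalone form (hypothetical weights, masking omitted):

#include <cassert>
#include <vector>
// Sample a 1-based channel from unnormalized weights w using one uniform random number r in [0,1).
int sampleChannel( const std::vector<double>& w, double r )
{
  double norm = 0.;
  for( double wi : w ) norm += wi;
  double cdf = 0.;
  for( size_t i = 0; i < w.size(); ++i )
  {
    cdf += w[i];
    if( r < cdf / norm ) return (int)i + 1; // 1-based, like channelId
  }
  return (int)w.size(); // fallback for r close to 1, mirroring the kernel's initialisation to nchannels
}
int main()
{
  const std::vector<double> w = { 1., 3., 6. }; // hypothetical per-diagram numerators
  assert( sampleChannel( w, 0.05 ) == 1 );      // 0.05 < 1/10
  assert( sampleChannel( w, 0.30 ) == 2 );      // 1/10 <= 0.30 < 4/10
  assert( sampleChannel( w, 0.95 ) == 3 );      // 4/10 <= 0.95 < 1
  return 0;
}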
@@ -1565,6 +1767,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1574,6 +1777,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1585,8 +1790,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1614,7 +1821,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1630,7 +1836,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1644,11 +1850,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1660,6 +1869,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1668,9 +1878,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1681,9 +1892,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) ***
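Note (illustrative sketch, not part of the patch): part 1a launches one calculate_jamps kernel per good helicity, each in its own stream, so independent helicities may overlap on the GPU; the cross-helicity steps (add_and_select_hel, normalise_output, select_col_and_diag) only run after the streams have completed. The same pattern in plain CUDA, with a hypothetical 'work' kernel in place of the gpu* abstraction macros:

#include <cuda_runtime.h>
__global__ void work( float* buf, int ihel ) // hypothetical stand-in for calculate_jamps
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x; // one event per GPU thread
  buf[ievt] = (float)ihel;
}
int main()
{
  const int nGoodHel = 4, gpublocks = 2, gputhreads = 32;
  const int nevt = gpublocks * gputhreads;
  float* d_buf = nullptr;
  cudaMalloc( &d_buf, nGoodHel * nevt * sizeof( float ) );
  cudaStream_t streams[nGoodHel];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) // kernels in different streams may run concurrently
    work<<<gpublocks, gputhreads, 0, streams[ighel]>>>( d_buf + ighel * nevt, ighel );
  cudaDeviceSynchronize(); // wait for all helicity streams before any cross-helicity reduction
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  cudaFree( d_buf );
  return 0;
}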
@@ -1721,40 +1938,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1769,7 +1952,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1817,82 +2001,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1917,13 +2116,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1931,13 +2124,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ )
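Note (illustrative sketch, not part of the patch): the rescaling just above multiplies the ME by the single-diagram-enhancement weight numerators[channelId - 1] / denominator, with both factors taken after the sum over good helicities. A tiny numeric example with hypothetical values:

#include <cassert>
int main()
{
  const int ndiagrams = 3;                       // hypothetical process
  double numerators[ndiagrams] = { 2., 3., 5. }; // per-diagram |amp|^2, summed over helicities
  double denominator = 2. + 3. + 5.;             // sum over all diagrams (and helicities)
  double me = 7.;                                // |M|^2 after the division by helcolDenominators
  const unsigned int channelId = 2;              // 1-based channel id, as in allChannelIds
  me *= numerators[channelId - 1] / denominator; // apply the channel weight
  assert( me == 7. * ( 3. / 10. ) );
  return 0;
}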
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index 0afe32f972..cedb7fcb32 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/processConfig.h new file mode 100644 index 0000000000..f7b8795d98 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXGU_H +#define MG5_CONFIG_SIGMA_SM_GU_TTXGU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 36; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GU_TTXGU_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file
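Note (illustrative sketch, not part of the patch): each subprocess now ships a processConfig.h with a constexpr per-process diagram count, so the per-diagram numerator buffers can be sized at compile time. A minimal sketch of the intended usage (hypothetical nevt; the header contents are restated inline to keep the example self-contained):

#include <vector>
namespace processConfig { constexpr int ndiagrams = 36; } // as in the generated header above
int main()
{
  const int nevt = 16; // hypothetical number of events
  std::vector<double> numerators( nevt * processConfig::ndiagrams, 0. ); // one slot per event and per diagram
  return ( numerators.size() == 16 * 36 ? 0 : 1 );
}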
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index bc0fc369d4..58def243d0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[7] += 1. / 2. * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -481,8 +557,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * amp_sv[0]; jamp_sv[6] -= 1. / 6. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. 
* amp_sv[0]; @@ -510,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[7] += 1. / 2. * amp_sv[0]; @@ -524,8 +609,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= 1. / 6. * amp_sv[0]; jamp_sv[7] += 1. / 2. * amp_sv[0]; @@ -538,8 +626,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -553,8 +644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[9] -= 1. / 6. * amp_sv[0]; @@ -567,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[11] += 1. / 2. * amp_sv[0]; @@ -581,8 +678,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -595,8 +695,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= 1. / 6. * amp_sv[0]; jamp_sv[11] += 1. / 2. 
* amp_sv[0]; @@ -609,8 +712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -624,8 +730,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -638,8 +747,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[4] += 1. / 2. * amp_sv[0]; @@ -652,8 +764,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 6. * amp_sv[0]; @@ -666,8 +781,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 6. * amp_sv[0]; @@ -680,8 +798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -695,8 +816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. 
* amp_sv[0]; jamp_sv[8] += 1. / 2. * amp_sv[0]; @@ -709,8 +833,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -723,8 +850,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * amp_sv[0]; jamp_sv[10] -= 1. / 6. * amp_sv[0]; @@ -737,8 +867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 2. * amp_sv[0]; jamp_sv[10] -= 1. / 6. * amp_sv[0]; @@ -751,8 +884,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -765,8 +901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 2. * amp_sv[0]; jamp_sv[9] -= 1. / 6. * amp_sv[0]; @@ -779,8 +918,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +935,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * amp_sv[0]; jamp_sv[6] -= 1. / 6. * amp_sv[0]; @@ -807,8 +952,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +969,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= 1. / 6. * amp_sv[0]; jamp_sv[11] += 1. / 2. * amp_sv[0]; @@ -835,8 +986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -849,8 +1003,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 6. * amp_sv[0]; jamp_sv[7] += 1. / 2. * amp_sv[0]; @@ -863,8 +1020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -899,8 +1059,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -915,8 +1078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -1284,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1362,8 +1527,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
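The hunks above show the heart of this patch: calculate_jamps no longer receives a per-event channelId; it receives a storeChannelWeights flag and fills one numerator slot per diagram (numerators_sv[idiag]), while helicity filtering simply passes false. A minimal standalone sketch of the before/after accumulation, with double standing in for fptype and the scalar (non-SIMD) case assumed for illustration only:

// Sketch only (not the generated code): one amplitude-squared contribution amp2
// for diagram idiag is folded into the multichannel numerator/denominator sums.

// Old accumulation: one numerator per event, gated by the preselected channel.
inline void accumulateOld( double& numerator, double& denominator, unsigned int channelId, unsigned int idiag, double amp2 )
{
  if( channelId == idiag + 1 ) numerator += amp2; // channelId is 1-based, idiag is 0-based
  if( channelId != 0 ) denominator += amp2;       // channelId == 0 disabled multichannel
}

// New accumulation: ndiagrams numerators per event, no channelId needed yet.
inline void accumulateNew( double* numerators, double& denominator, bool storeChannelWeights, unsigned int idiag, double amp2 )
{
  if( storeChannelWeights )
  {
    numerators[idiag] += amp2; // one slot per diagram: numerators[0..ndiagrams-1]
    denominator += amp2;       // running sum over all diagrams, as before
  }
}

Deferring the channel decision is what later allows both the channel-weight multiplication and the event-by-event diagram sampling to be done after the helicity loop.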
#endif /* clang-format on */ @@ -1425,25 +1589,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
    allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
    {
      fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
      fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
      for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
      {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
        totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
      }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
    }
 #endif
    return;
@@ -1488,16 +1662,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim ==
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1565,6 +1767,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1574,6 +1777,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1585,8 +1790,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1614,7 +1821,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1630,7 +1836,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1644,11 +1850,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1660,6 +1869,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1668,9 +1878,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1681,9 +1892,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1721,40 +1938,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
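With per-diagram numerators, normalise_output (launched above) must fold the nGoodHel per-helicity numerator vectors into the helicity #0 slot before applying the single-diagram weight. A host-side sketch of that per-event reduction, using double for fptype and a plain function in place of the one-thread-per-event kernel (names are illustrative, not the kernel's):

// Sketch of the normalise_output reduction for one event ievt.
void normaliseEvent( double& ME, double* helNumerators, double* helDenominators,
                     int nGoodHel, int nevt, int ievt, int ndiagrams,
                     unsigned int channelId, bool mulChannelWeight, double globaldenom )
{
  ME /= globaldenom; // average over helicities/colors
  double* totNum = helNumerators + ievt * ndiagrams; // reuse the "helicity #0" slot as the total
  double& totDen = helDenominators[ievt];
  for( int ighel = 1; ighel < nGoodHel; ighel++ ) // fold helicities 1..nGoodHel-1 into slot 0
  {
    const double* hNum = helNumerators + ( ievt + ighel * nevt ) * ndiagrams;
    for( int idiag = 0; idiag < ndiagrams; idiag++ ) totNum[idiag] += hNum[idiag];
    totDen += helDenominators[ievt + ighel * nevt];
  }
  if( mulChannelWeight ) ME *= totNum[channelId - 1] / totDen; // channelId is 1-based
}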
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1769,7 +1952,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1817,82 +2001,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1917,13 +2116,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1931,13 +2124,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index 949f0f0e2a..a7b234154a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/processConfig.h new file mode 100644 index 0000000000..94a05c400f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/processConfig.h @@ -0,0 
+1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXGUX_H
+#define MG5_CONFIG_SIGMA_SM_GUX_TTXGUX_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 36;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXGUX_H
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc
index c691d758e7..cc26513453 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -103,6 +104,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId!
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -244,7 +308,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -253,7 +317,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -349,7 +413,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -362,7 +427,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,12 +436,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -402,8 +463,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. 
* amp_sv[0]; @@ -418,8 +482,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -434,8 +501,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[1], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 4. * amp_sv[0]; @@ -480,8 +556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 4. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. 
@@ -865,9 +947,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -943,8 +1024,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1006,25 +1086,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1069,16 +1159,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1146,6 +1264,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -1155,6 +1274,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1166,8 +1287,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,                 // output: helicity selection[nevt]
-            fptype* allNumerators,          // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,        // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,          // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators,        // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
             const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1195,7 +1318,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1211,7 +1333,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1225,11 +1347,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1241,6 +1366,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
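The select_col_and_diag kernel above samples a diagram with an inverse-CDF scan over the per-diagram numerators, skipping channels that have no SDE config (channel2iconfig == -1). A standalone sketch of that scan, with made-up numerators and mapping (illustrative assumptions only):

#include <cstdio>

int main()
{
  constexpr unsigned int nchannels = 7;
  const int channel2iconfig[nchannels] = { 1, 2, -1, 3, 4, -1, 5 };           // dummy mapping
  const double numerators[nchannels] = { 0.5, 0.1, 9., 0.2, 0.1, 9., 0.1 };   // dummy per-diagram sums
  const double rnd = 0.75;                                                    // one uniform number per event
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];    // skipped channels add nothing
  unsigned int channelId = nchannels; // fallback, only reached if rounding passes the last bin
  double numerator_sum = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    numerator_sum += numerators[ichan];
    if( rnd < numerator_sum / normalization )
    {
      channelId = ichan + 1; // channel ids are 1-based
      break;
    }
  }
  printf( "sampled channelId=%u\n", channelId ); // here: cumulative {0.5,0.6,0.8,0.9,1.0} -> channel 4
  return 0;
}

Because the skipped channels are excluded from both the normalization and the cumulative sum, they can never be selected, and the valid channels are sampled in proportion to their single-diagram numerators.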
@@ -1249,9 +1375,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1262,9 +1389,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1302,40 +1435,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1350,7 +1449,8 @@ namespace mg5amcCpu
         cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
         // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+        bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
         calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
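The normalise_output kernel launched above now reduces the per-helicity numerator super-buffer diagram by diagram into the ighel = 0 slice, while the denominators keep one scalar per event. A standalone host-side model of that reduction (nevt, nGoodHel and the all-ones fill are assumptions for illustration):

#include <cassert>
#include <vector>

int main()
{
  constexpr int nevt = 2, nGoodHel = 3, ndiagrams = 7;
  std::vector<double> num( nGoodHel * nevt * ndiagrams, 1. ); // per-helicity numerators [ighel][ievt][idiag]
  std::vector<double> den( nGoodHel * nevt, 1. );             // per-helicity denominators [ighel][ievt]
  for( int ievt = 0; ievt < nevt; ++ievt )
    for( int ighel = 1; ighel < nGoodHel; ighel++ ) // NB: the loop starts at ighel=1
    {
      den[ievt] += den[ievt + ighel * nevt]; // scalar denominator total, as before
      double* hNum = &num[( ievt + ighel * nevt ) * ndiagrams];
      double* firstNum = &num[ievt * ndiagrams];
      for( int idiag = 0; idiag < ndiagrams; ++idiag )
        firstNum[idiag] += hNum[idiag]; // one running total per diagram: the new part
    }
  assert( num[0] == nGoodHel && den[0] == nGoodHel ); // each helicity slice contributed exactly once
  return 0;
}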
@@ -1398,82 +1498,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1498,13 +1613,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
    // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
      const int ievt0 = ipagV * neppV;
@@ -1512,13 +1621,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
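The C++ sampling loop above addresses the per-diagram numerators with the AOSOA index ievt / neppV * neppV * ndiagrams + ichan * neppV + ievt % neppV. A standalone check that this layout gives every (event, diagram) pair a distinct slot (neppV, ndiagrams and the fill pattern are assumed for the test):

#include <cassert>
#include <vector>

int main()
{
  constexpr int neppV = 4, ndiagrams = 7, nevt = 8;
  std::vector<double> allNumerators( nevt * ndiagrams );
  // Events come in pages of neppV; within a page the layout is [idiag][ieppV]
  for( int ievt = 0; ievt < nevt; ++ievt )
    for( int idiag = 0; idiag < ndiagrams; ++idiag )
      allNumerators[ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV] = 100. * ievt + idiag;
  // Round-trip: every slot can be read back unclobbered
  for( int ievt = 0; ievt < nevt; ++ievt )
    for( int idiag = 0; idiag < ndiagrams; ++idiag )
      assert( allNumerators[ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV] == 100. * ievt + idiag );
  return 0;
}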
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h
index 25aacba743..23cb81ba6a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h
@@ -168,6 +168,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -176,6 +177,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -192,6 +195,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -199,6 +203,8 @@ namespace mg5amcCpu
             int* allselcol,                    // output: helicity selection[nevt]
             fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
             fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply channel weight to ME output
 #endif
             const int nevt );                  // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
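With the new signature, every numerator buffer scales with the per-process diagram count. A host-side sketch of the resulting sizes, with assumed nevt and nGoodHel values (the processConfig.h introduced below provides ndiagrams = 7 for this subprocess):

#include <cstdio>

namespace processConfig { constexpr int ndiagrams = 7; } // mirrors processConfig.h below

int main()
{
  const int nevt = 16384, nGoodHel = 12;
  const size_t denSize = (size_t)nGoodHel * nevt;                               // unchanged: one scalar per event/helicity
  const size_t numSizeOld = (size_t)nGoodHel * nevt;                            // before: one scalar per event/helicity
  const size_t numSizeNew = (size_t)nGoodHel * processConfig::ndiagrams * nevt; // now: one slot per diagram
  printf( "denominators: %zu, numerators: %zu -> %zu fptype slots\n", denSize, numSizeOld, numSizeNew );
  return 0;
}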
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_UC_TTXUC_H
+#define MG5_CONFIG_SIGMA_SM_UC_TTXUC_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 7;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_UC_TTXUC_H
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
index ca438c57e9..9603b4f631 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -109,6 +110,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
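The getChannelId helper above centralizes the SIMD-page uniformity checks that were previously inlined in sigmaKin. A minimal standalone model of the contract it asserts (neppV and the channelIds content are assumptions): all neppV events of one SIMD page must carry the same channelId, because the C++ kernel computes one scalar channel per page.

#include <cassert>

unsigned int pageChannelId( const unsigned int* allChannelIds, int ievt00, int neppV )
{
  unsigned int channelId = allChannelIds[ievt00]; // element[0] of the page
  for( int i = 1; i < neppV; ++i )
    assert( channelId == allChannelIds[ievt00 + i] ); // mirrors SANITY CHECK #898
  return channelId;
}

int main()
{
  constexpr int neppV = 4;
  const unsigned int allChannelIds[2 * neppV] = { 3, 3, 3, 3, 5, 5, 5, 5 }; // two uniform pages
  assert( pageChannelId( allChannelIds, 0, neppV ) == 3 );
  assert( pageChannelId( allChannelIds, neppV, neppV ) == 5 );
  return 0;
}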
@@ -250,7 +314,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                   fptype* allJamps,                  // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                  const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                  bool storeChannelWeights,
                   fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
                   fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
                   fptype* colAllJamp2s,              // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -259,7 +323,7 @@ namespace mg5amcCpu
 #else
                   cxtype_sv* allJamp_sv,             // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                  const unsigned int channelId,      // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+                  bool storeChannelWeights,
                   fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
                   fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
                   fptype_sv* jamp2_sv,               // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -355,7 +419,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -368,7 +433,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -377,12 +442,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -408,8 +469,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
       jamp_sv[3] += 1. / 12. * amp_sv[0];
@@ -424,8 +488,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 2
       FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
       jamp_sv[3] += 1. / 12. * amp_sv[0];
@@ -440,8 +507,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 3
       VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
@@ -454,8 +524,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 4
       FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
@@ -470,8 +543,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[4] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
@@ -486,8 +562,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[5] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
@@ -502,8 +581,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[6] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
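On the GPU side of calculate_jamps above, each thread now writes its diagram slots at &allNumerators[ievt * ndiagrams], and each helicity owns a slice of size nevt * ndiagrams inside the super-buffer. A standalone check of that layout (nevt, nGoodHel and ndiagrams are assumed values):

#include <cassert>
#include <vector>

int main()
{
  constexpr int nevt = 8, nGoodHel = 3, ndiagrams = 7;
  std::vector<double> ghelAllNumerators( nGoodHel * nevt * ndiagrams, 0. );
  for( int ighel = 0; ighel < nGoodHel; ++ighel )
  {
    double* hAllNumerators = ghelAllNumerators.data() + ighel * nevt * ndiagrams; // one helicity slice
    for( int ievt = 0; ievt < nevt; ++ievt )
    {
      double* numerators = &hAllNumerators[ievt * ndiagrams]; // per-event pointer, as in calculate_jamps
      for( int idiag = 0; idiag < ndiagrams; ++idiag ) numerators[idiag] += 1.;
    }
  }
  for( double x : ghelAllNumerators ) assert( x == 1. ); // every slot touched exactly once, no overlap
  return 0;
}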
@@ -871,9 +953,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -949,8 +1030,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1012,25 +1092,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1075,16 +1165,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1152,6 +1270,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -1161,6 +1280,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1172,8 +1293,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,                 // output: helicity selection[nevt]
-            fptype* allNumerators,          // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,        // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,          // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators,        // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
             const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1201,7 +1324,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1217,7 +1339,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1231,11 +1353,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1247,6 +1372,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
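The color choice in select_col_and_diag above is unchanged in spirit: cumulative |jamp|^2 over the colors allowed by icolamp for the selected config, followed by an inverse-CDF lookup with the color random number. A standalone sketch with dummy icolamp/jamp2 values (illustrative assumptions only):

#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  const bool icolamp[ncolor] = { true, false, true, true }; // allowed colors for this iconfig (dummy)
  const double jamp2[ncolor] = { 2., 5., 1., 1. };          // squared partial amplitudes (dummy)
  const double rndcol = 0.6;                                // uniform random number in [0,1)
  double targetamp[ncolor];
  for( int icolC = 0; icolC < ncolor; icolC++ )
  {
    targetamp[icolC] = ( icolC == 0 ? 0. : targetamp[icolC - 1] ); // running cumulative sum
    if( icolamp[icolC] ) targetamp[icolC] += jamp2[icolC];         // disallowed colors add nothing
  }
  int selcol = 0;
  for( int icolC = 0; icolC < ncolor; icolC++ )
    if( rndcol < targetamp[icolC] / targetamp[ncolor - 1] )
    {
      selcol = icolC + 1; // NB Fortran [1,ncolor] convention
      break;
    }
  printf( "selected color %d\n", selcol ); // here: cumulative {2,2,3,4}/4 -> color 3
  return 0;
}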
@@ -1255,9 +1381,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1268,9 +1395,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1308,40 +1441,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1356,7 +1455,8 @@ namespace mg5amcCpu
         cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
         // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+        bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
         calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
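When mulChannelWeight is set, normalise_output above (and the C++ tail below) folds the single-diagram weight N_k / D into the matrix element, where D is the sum of the per-diagram numerators after the helicity reduction. A standalone arithmetic check with assumed values:

#include <cassert>
#include <cstdio>

int main()
{
  constexpr int ndiagrams = 7;
  const double numerators[ndiagrams] = { 4., 2., 1., 1., 1., 0.5, 0.5 }; // summed over helicities (dummy)
  double denominator = 0.;
  for( int idiag = 0; idiag < ndiagrams; ++idiag ) denominator += numerators[idiag];
  const unsigned int channelId = 1; // 1-based, as in allChannelIds / allDiagramIdsOut
  double ME = 0.02;                 // |M|^2 averaged over helicities and colors (dummy)
  ME *= numerators[channelId - 1] / denominator; // w_1 = 4/10 -> ME = 0.008
  assert( ME > 0.0079 && ME < 0.0081 );
  printf( "weighted ME = %f\n", ME );
  return 0;
}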
@@ -1404,82 +1504,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
          {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
        }
       }
       else
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1504,13 +1619,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1518,13 +1627,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index ac3df5ae1d..05b5116162 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -174,6 +174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, 
// input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -182,6 +183,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -198,6 +201,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -205,6 +209,8 @@ int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/processConfig.h new file mode 100644 index 0000000000..f52e249e91 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
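For orientation: with processConfig::ndiagrams available, the numerator buffers in this patch grow from one entry per event to one entry per event and per diagram, so that a channel can be sampled a posteriori from the stored per-diagram weights. A minimal sketch of the index arithmetic implied by the accesses in this diff (the helper names numIdxGpu and numIdxCpp are illustrative, not part of the patch; neppV and processConfig::ndiagrams are assumed in scope as in the surrounding code):

// Sketch only: index helpers matching the two layouts used in this patch.
// GPU layout: each event owns a contiguous slice of ndiagrams entries,
// cf. '&allNumerators[ievt * processConfig::ndiagrams]' in calculate_jamps.
inline int numIdxGpu( int ievt, int idiag )
{
  return ievt * processConfig::ndiagrams + idiag;
}
// SIMD C++ layout: events are paged by neppV, with the diagram as the middle index,
// cf. 'allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV]' in sigmaKin.
inline int numIdxCpp( int ievt, int idiag )
{
  return ( ievt / neppV ) * neppV * processConfig::ndiagrams + idiag * neppV + ( ievt % neppV );
}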
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UCX_TTXUCX_H +#define MG5_CONFIG_SIGMA_SM_UCX_TTXUCX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 7; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UCX_TTXUCX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 7603295c95..a72a6225b4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. 
* amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -446,8 +516,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -462,8 +535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[1], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 4. * amp_sv[0]; @@ -480,8 +556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -512,8 +594,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -526,8 +611,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -542,8 +630,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[1], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -558,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[10], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -574,8 +668,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[10], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -590,8 +687,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[6], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 4. * amp_sv[0]; @@ -606,8 +706,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. 
* amp_sv[0]; @@ -975,9 +1078,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1053,8 +1155,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1116,25 +1217,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + 
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1179,16 +1290,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1256,6 +1395,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1265,6 +1405,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1276,8 +1418,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1305,7 +1449,6 @@ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1321,7 +1464,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1335,11 +1478,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1351,6 +1497,7 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1359,9 
+1506,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1372,9 +1520,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1412,40 +1566,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1460,7 +1580,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1508,82 +1629,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1608,13 +1744,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1622,13 +1752,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 8af0c5a78c..70a92da32a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // 
input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/processConfig.h new file mode 100644 index 0000000000..8f10a6d734 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
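For orientation: the event-by-event channel choice added in sigmaKin and select_col_and_diag above is an inverse-CDF draw over the per-diagram numerators, skipping diagrams that have no associated SDE config (channel2iconfig[ichan] == -1) and falling back to the last channel when rounding leaves the cumulative sum below the random number. A minimal standalone sketch under stated assumptions (the function name is illustrative, the numerators are assumed in a flat per-event layout, and double stands in for fptype):

// Sketch only: sample a 1-based channelId from per-diagram weights,
// mirroring the selection loops in sigmaKin / select_col_and_diag above.
unsigned int sampleDiagram( const double* numerators, double rnddiagram ) // rnddiagram flat in [0,1)
{
  // Normalization: sum of weights over diagrams that map to an SDE config
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
    if( mgOnGpu::channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  // Inverse-CDF draw: return the first channel whose cumulative weight exceeds rnddiagram
  double numerator_sum = 0.;
  for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
  {
    if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
    numerator_sum += numerators[ichan];
    if( rnddiagram < numerator_sum / normalization ) return ichan + 1; // 1-based channelId
  }
  return mgOnGpu::nchannels; // fallback for rounding effects, as in the patch
}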
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UU_TTXUU_H +#define MG5_CONFIG_SIGMA_SM_UU_TTXUU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 14; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UU_TTXUU_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index 77d7eddc6c..83505e62ba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -109,6 +110,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -250,7 +314,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -259,7 +323,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -355,7 +419,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -368,7 +433,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -377,12 +442,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -408,8 +469,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. 
* amp_sv[0]; @@ -424,8 +488,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -440,8 +507,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -454,8 +524,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -470,8 +543,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -486,8 +562,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -502,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. 
* amp_sv[0]; @@ -871,9 +953,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -949,8 +1030,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1012,25 +1092,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + 
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1075,16 +1165,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1152,6 +1270,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1161,6 +1280,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1172,8 +1293,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1201,7 +1324,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1217,7 +1339,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1231,11 +1353,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1247,6 +1372,7 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1255,9 
+1381,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1268,9 +1395,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1308,40 +1441,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1356,7 +1455,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1404,82 +1504,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1504,13 +1619,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1518,13 +1627,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index f37d8b5515..b7ffff9d65 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -174,6 +174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, 
// input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -182,6 +183,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -198,6 +201,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -205,6 +209,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/processConfig.h new file mode 100644 index 0000000000..2dfae1920f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
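Aside: this header's single constant, processConfig::ndiagrams, is what sizes the new per-diagram numerator buffers above. A minimal sketch of the two index layouts implied by the CPPProcess.cc changes, assuming neppV is the SIMD page size used elsewhere in the codebase; the helper names gpuNumeratorIndex and cppNumeratorIndex are illustrative, not code added by this patch.

    // Sketch only, inferred from the indexing in this patch.
    // GPU layout: one event per thread, the ndiagrams weights of one event are contiguous,
    // cf. &allNumerators[ievt * processConfig::ndiagrams] in calculate_jamps.
    inline int gpuNumeratorIndex( int ievt, int idiag, int ndiagrams )
    {
      return ievt * ndiagrams + idiag;
    }
    // C++/SIMD layout: events grouped in pages of neppV, each diagram strided by neppV
    // within a page, cf. allNumerators[ievt / neppV * neppV * processConfig::ndiagrams
    // + ichan * neppV + ieppV % neppV] in the sigmaKin sampling loop.
    inline int cppNumeratorIndex( int ievt, int idiag, int ndiagrams, int neppV )
    {
      const int ievtPage = ievt / neppV * neppV; // first event of this SIMD page
      return ievtPage * ndiagrams + idiag * neppV + ievt % neppV;
    }
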
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXCCX_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTXCCX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 7; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXCCX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 4f2c72bff8..4c8f471aba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -481,8 +557,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[0], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. 
* amp_sv[0]; @@ -510,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -524,8 +609,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -538,8 +626,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[5], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -553,8 +644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[9] -= 1. / 2. * amp_sv[0]; @@ -567,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[0], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -581,8 +678,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -595,8 +695,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[5] -= 1. / 2. 
* amp_sv[0]; @@ -609,8 +712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[5], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -624,8 +730,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 6. * amp_sv[0]; jamp_sv[9] -= 1. / 2. * amp_sv[0]; @@ -638,8 +747,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[8] += 1. / 6. * amp_sv[0]; @@ -652,8 +764,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. * amp_sv[0]; jamp_sv[9] -= 1. / 2. * amp_sv[0]; @@ -666,8 +781,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[0], w_fp[11], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -680,8 +798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[5], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -695,8 +816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2.
* amp_sv[0]; jamp_sv[8] += 1. / 6. * amp_sv[0]; @@ -709,8 +833,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -723,8 +850,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -737,8 +867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * amp_sv[0]; jamp_sv[7] += 1. / 6. * amp_sv[0]; @@ -751,8 +884,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -765,8 +901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -779,8 +918,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +935,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[4] -= 1. / 2. * amp_sv[0]; @@ -807,8 +952,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +969,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[0], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * amp_sv[0]; jamp_sv[7] += 1. / 6. * amp_sv[0]; @@ -835,8 +986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -849,8 +1003,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -863,8 +1020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
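Across these hunks the pattern is uniform: each diagram's |amplitude|^2 now goes into its own numerator slot (diagram N fills numerators_sv[N-1]), while every diagram also feeds the shared denominator. A minimal scalar sketch of that accumulation, with assumed stand-in names (std::norm plays the role of cxabs2, NDIAGRAMS stands for processConfig::ndiagrams):

#include <complex>
constexpr int NDIAGRAMS = 36; // stand-in for processConfig::ndiagrams
// Sketch only: per-diagram numerators plus one shared denominator,
// mirroring the storeChannelWeights blocks in the hunks above.
inline void accumulateDiagram( int idiag, std::complex<double> amp, double* numerators, double& denominator )
{
  const double amp2 = std::norm( amp ); // |amp|^2, like cxabs2( amp_sv[0] )
  numerators[idiag] += amp2;            // diagram idiag+1 -> slot idiag
  denominator += amp2;                  // running sum over all diagrams
}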
@@ -899,8 +1059,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[5], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -915,8 +1078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * amp_sv[0]; jamp_sv[4] -= 1. / 2. * amp_sv[0]; @@ -1284,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1362,8 +1527,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
#endif /* clang-format on */ @@ -1425,25 +1589,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: sum the denominators over helicities too + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return;
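Read sequentially, normalise_output first divides each ME by the global denominator, then folds the per-helicity numerator and denominator buffers into helicity slot #0 and applies the single-diagram-enhancement weight. A scalar host-side sketch of that reduction (hypothetical names; assumes the flat [ighel][ievt][idiag] numerator layout used above and a valid channelIds array):

// Sketch only, not the generated kernel.
static void normaliseSketch( double* allMEs, double* numerators, double* denominators,
                             const unsigned int* channelIds, int nevt, int nGoodHel,
                             int ndiagrams, double globaldenom )
{
  for( int ievt = 0; ievt < nevt; ++ievt )
  {
    allMEs[ievt] /= globaldenom;
    for( int ighel = 1; ighel < nGoodHel; ++ighel ) // slot ighel=0 accumulates the totals
    {
      for( int idiag = 0; idiag < ndiagrams; ++idiag )
        numerators[ievt * ndiagrams + idiag] += numerators[( ievt + ighel * nevt ) * ndiagrams + idiag];
      denominators[ievt] += denominators[ievt + ighel * nevt];
    }
    // SDE enhancement: weight the ME by this event's channel share of the total
    allMEs[ievt] *= numerators[ievt * ndiagrams + ( channelIds[ievt] - 1 )] / denominators[ievt];
  }
}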
@@ -1488,16 +1662,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators[nevt*ndiagrams] + const fptype* allDenominators, // input: denominators[nevt] + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1565,6 +1767,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1574,6 +1777,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1585,8 +1790,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif
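The channel choice in select_col_and_diag is a standard inverse-CDF draw over the per-diagram weights. A minimal standalone sketch of the same selection (hypothetical host-side names; numerators holds this event's per-diagram weights, rnd is uniform in [0,1)):

// Sketch: pick a diagram with probability numerator[i] / sum(numerators),
// skipping diagrams with no SDE config (channel2iconfig[i] == -1).
static unsigned int sampleDiagram( const double* numerators, const int* channel2iconfig,
                                   unsigned int nchannels, double rnd )
{
  double norm = 0.;
  for( unsigned int i = 0; i < nchannels; ++i )
    if( channel2iconfig[i] != -1 ) norm += numerators[i];
  double cumsum = 0.;
  for( unsigned int i = 0; i < nchannels; ++i )
  {
    if( channel2iconfig[i] == -1 ) continue;
    cumsum += numerators[i];
    if( rnd < cumsum / norm ) return i + 1; // 1-based channelId, as in the kernel
  }
  return nchannels; // fallback, mirroring the kernel's last-channel default
}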
@@ -1614,7 +1821,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1630,7 +1836,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1644,11 +1850,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1660,6 +1869,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1668,9 +1878,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1681,9 +1892,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col,
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1721,40 +1938,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1769,7 +1952,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1817,82 +2001,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1917,13 +2116,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1931,13 +2124,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 311a96d812..8be5530c1c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/processConfig.h new file mode 100644 index 0000000000..25081a00da --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXGG_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTXGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 36; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXGG_H \ No newline at end of file
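The new per-process processConfig.h makes the diagram count a compile-time constant, so buffer sizes and flat indices can be derived from it. A small illustrative helper (an assumption-level sketch, not part of the patch):

#include <cstddef>
#include "processConfig.h"
// One numerator slot per diagram and per event, replicated per good helicity:
// total size = nGoodHel * nevt * processConfig::ndiagrams (cf. the gpuMemset calls in this patch).
inline std::size_t numeratorBufferSize( int nGoodHel, int nevt )
{
  return static_cast<std::size_t>( nGoodHel ) * nevt * processConfig::ndiagrams;
}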
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 65f0e5aaf4..25cb87562c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId!
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, accumulate per-diagram numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, accumulate per-diagram numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12.
* amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -446,8 +516,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -462,8 +535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -480,8 +556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -512,8 +594,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 4.
* cxtype( 0, 1 ) * amp_sv[0]; @@ -526,8 +611,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -542,8 +630,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -558,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -574,8 +668,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[10], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -590,8 +687,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[6], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -606,8 +706,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0];
@@ -975,9 +1078,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1053,8 +1155,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1116,25 +1217,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + {
+ firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: sum the denominators over helicities too + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1179,16 +1290,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators[nevt*ndiagrams] + const fptype* allDenominators, // input: denominators[nevt] + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels )
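For reference, the numerator super-buffers used in both copies of this kernel keep all diagrams of one event contiguous, with one block per good helicity. A one-line helper stating the assumed flat layout (illustrative only, not part of the patch):

// ghelAllNumerators layout: [ighel][ievt][idiag]
inline int numeratorIndex( int ighel, int ievt, int idiag, int nevt, int ndiagrams )
{
  return ( ievt + ighel * nevt ) * ndiagrams + idiag;
}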
@@ -1256,6 +1395,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1265,6 +1405,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1276,8 +1418,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1305,7 +1449,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1321,7 +1464,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1335,11 +1478,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1351,6 +1497,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1359,9
+1506,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1372,9 +1520,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1412,40 +1566,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1460,7 +1580,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1508,82 +1629,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1608,13 +1744,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1622,13 +1752,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index 75597d043e..13ce403cae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol,
// input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/processConfig.h new file mode 100644 index 0000000000..e6c319de76 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/processConfig.h @@ -0,0 +1,21 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin.
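+//
+// NB: processConfig::ndiagrams is the number of Feynman diagrams in this subprocess.
+// It sets the per-event stride of the multichannel numerator buffers, which store one
+// squared amplitude per diagram (see calculate_jamps, normalise_output and
+// select_col_and_diag) instead of a single numerator for the preselected channel.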
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXUUX_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTXUUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 14; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXUUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index d938dc9999..5096994855 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -103,6 +104,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -244,7 +308,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -253,7 +317,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -349,7 +413,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -362,7 +427,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,12 +436,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -402,8 +463,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 12. 
* amp_sv[0]; @@ -418,8 +482,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -434,8 +501,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -480,8 +556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[0], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[0], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. 
* amp_sv[0]; @@ -865,9 +947,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -943,8 +1024,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1006,25 +1086,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + 
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1069,16 +1159,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1146,6 +1264,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1155,6 +1274,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1166,8 +1287,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1195,7 +1318,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1211,7 +1333,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1225,11 +1347,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1241,6 +1366,7 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1249,9
+1375,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1262,9 +1389,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1302,40 +1435,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1350,7 +1449,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1398,82 +1498,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1498,13 +1613,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1512,13 +1621,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index ebbade848b..f4abe8c1e9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -168,6 +168,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype*
allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -176,6 +177,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -192,6 +195,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -199,6 +203,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/processConfig.h new file mode 100644 index 0000000000..7adcbeb7fc --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/processConfig.h @@ -0,0 +1,21 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin.
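+//
+// NB: processConfig::ndiagrams is the number of Feynman diagrams in this subprocess.
+// It sets the per-event stride of the multichannel numerator buffers, which store one
+// squared amplitude per diagram (see calculate_jamps, normalise_output and
+// select_col_and_diag) instead of a single numerator for the preselected channel.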
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UXCX_TTXUXCX_H +#define MG5_CONFIG_SIGMA_SM_UXCX_TTXUXCX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 7; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UXCX_TTXUXCX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index ef2de04fdb..44dceb663b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 12. 
* amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -446,8 +516,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. * amp_sv[0]; @@ -462,8 +535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -480,8 +556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -512,8 +594,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -526,8 +611,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[0], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -542,8 +630,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -558,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -574,8 +668,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[10], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -590,8 +687,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[6], w_fp[0], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -606,8 +706,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[10], w_fp[0], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. 
* amp_sv[0]; @@ -975,9 +1078,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1053,8 +1155,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1116,25 +1217,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { +
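// accumulate this helicity's per-diagram numerators onto the ighel=0 slice, so that after the loop firstNumerator holds the per-diagram totals over all good helicities +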
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1179,16 +1290,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1256,6 +1395,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1265,6 +1405,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1276,8 +1418,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1305,7 +1449,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1321,7 +1464,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1335,11 +1478,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1351,6 +1497,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1359,9 
+1506,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1372,9 +1520,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1412,40 +1566,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1460,7 +1580,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1508,82 +1629,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1608,13 +1744,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1622,13 +1752,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 17c9c0faf1..e4b749f215 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* 
allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/processConfig.h new file mode 100644 index 0000000000..368fc584e6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin.
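+// NB: ndiagrams = 14 matches the 14 single-diagram |amp|^2 contributions accumulated +// into numerators_sv[0]..numerators_sv[13] in this subprocess's calculate_jamps above.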
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UXUX_TTXUXUX_H +#define MG5_CONFIG_SIGMA_SM_UXUX_TTXUXUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 14; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UXUX_TTXUXUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + 
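// the returned dummy value only exists to drive the thread-safe one-time call of this helicity filtering (static local initialization in initialize() below) +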
return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
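+ // bind each recognized input key to the matching typed pointer declared above; keys this process cannot honour are rejected with UMAMI_ERROR_UNSUPPORTED_INPUT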
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
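The entry points above implement the C interface declared in umami.h, added below. As a rough sketch of how a caller might drive them, assuming a contiguous batch (stride == count, offset == 0), a hypothetical parameter card path, and an illustrative four-particle layout of npar x 4 doubles per event:

#include "umami.h"
#include <cstdio>
#include <vector>

int main()
{
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; // hypothetical card path

  const size_t count = 16, stride = 16, offset = 0;                  // contiguous batch
  std::vector<double> momenta( count * 4 /*npar, illustrative*/ * 4 ); // fill with phase-space points
  std::vector<double> m2( count );

  UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA };
  const void* inputs[] = { momenta.data() };
  UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outputs[] = { m2.data() };

  UmamiStatus s = umami_matrix_element( handle, count, stride, offset,
                                        1, in_keys, inputs, 1, out_keys, outputs );
  if( s == UMAMI_SUCCESS ) std::printf( "|M|^2[0] = %g\n", m2[0] );
  umami_free( handle );
  return s == UMAMI_SUCCESS ? 0 : 1;
}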
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
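The metadata keys let a caller size its output buffers before the first evaluation; for instance, UMAMI_OUT_DIAGRAM_AMP2 needs one slot per diagram per event. A small sketch, assuming (the header alone does not pin this down) that the count-valued keys write through an int pointer:

#include "umami.h"
#include <vector>

// Allocate the UMAMI_OUT_DIAGRAM_AMP2 output for one batch of `stride` events.
std::vector<double> makeAmp2Buffer( size_t stride )
{
  int n_diagrams = 0;
  umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &n_diagrams ); // assumed to fill an int
  return std::vector<double>( stride * static_cast<size_t>( n_diagrams ) );
}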
+
+  /**
+   * Queries a global metadata entry of this matrix element code, e.g. the device
+   * it runs on or its number of particles, diagrams, helicity and color
+   * configurations.
+   *
+   * @param meta_key
+   *     key of the metadata entry to query
+   * @param result
+   *     pointer to caller-allocated memory receiving the value; its type depends
+   *     on the key
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *     pointer to an instance of the subprocess. Has to be cleaned up by
+   *     the caller with `umami_free`.
+   * @param param_card_path
+   *     path to the parameter file
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     real part of the parameter value
+   * @param parameter_imag
+   *     imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *     pointer to double to return imaginary part of the parameter value. Ignored
+   *     for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param count
+   *     number of events to evaluate the matrix element for
+   * @param stride
+   *     stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *     offset of the event index
+   * @param input_count
+   *     number of inputs to the matrix element
+   * @param input_keys
+   *     pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *     pointer to an array of void pointers to the inputs. The type of the inputs
+   *     depends on the input key
+   * @param output_count
+   *     number of outputs to the matrix element
+   * @param output_keys
+   *     pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *     pointer to an array of void pointers to the outputs. The type of the outputs
+   *     depends on the output key. The caller is responsible for allocating memory for
+   *     the outputs.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt
index e728335e4c..ddfce4015b 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox".
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -72,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.07860422134399414  +DEBUG: model prefixing takes 0.034606218338012695  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -87,21 +88,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.729 s +1 processes with 72 diagrams generated in 1.407 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: 
initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -113,25 +114,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.132 s -Wrote files for 119 helas calls in 0.360 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 
64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.068 s +Wrote files for 119 helas calls in 3.822 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.215 s +ALOHA: aloha creates 5 routines in 0.160 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.214 s +ALOHA: aloha creates 10 routines in 0.152 s VVV5 VVV5 FFV1 @@ -141,32 +142,34 @@ ALOHA: aloha creates 10 routines in 0.214 s VVVV1 VVVV9 VVVV10 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m5.833s -user 0m5.426s -sys 0m0.391s -Code generation completed in 6 seconds +real 0m13.208s +user 0m3.391s +sys 0m0.650s +Code generation completed in 13 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -187,9 +190,9 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -216,9 +219,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat index 7758c3603b..ee875f040f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat @@ -216,17 +216,19 @@ decay 23 mdl_WZ decay 24 mdl_WW decay 25 mdl_WH decay 6 mdl_WT +mass 1 mdl_MD mass 11 mdl_Me mass 13 mdl_MMU mass 15 mdl_MTA -mass 1 mdl_MD +mass 2 mdl_MU mass 23 mdl_MZ mass 25 mdl_MH -mass 2 mdl_MU mass 3 mdl_MS mass 4 mdl_MC mass 5 mdl_MB mass 6 mdl_MT +smeft 1 mdl_cG +smeft 10 mdl_cuHRe smeft 100 mdl_ceHRe smeft 101 mdl_ceWRe smeft 102 mdl_ceBRe @@ -237,7 +239,7 @@ smeft 106 mdl_cll smeft 107 mdl_cll1 smeft 108 mdl_clj1 smeft 109 mdl_clj3 -smeft 10 mdl_cuHRe +smeft 11 mdl_ctHRe smeft 110 mdl_cQl1 smeft 111 mdl_cQl3 smeft 112 mdl_cee @@ -248,7 +250,7 @@ smeft 116 mdl_cbe smeft 117 mdl_cje smeft 118 mdl_cQe smeft 119 mdl_clu -smeft 11 mdl_ctHRe +smeft 12 mdl_cdHRe smeft 120 mdl_ctl smeft 121 mdl_cld smeft 122 mdl_cbl @@ -259,7 +261,6 @@ smeft 126 mdl_cleju1Re smeft 127 mdl_cleQt1Re smeft 128 mdl_cleju3Re smeft 129 mdl_cleQt3Re -smeft 12 mdl_cdHRe smeft 13 mdl_cbHRe smeft 14 mdl_cuGRe smeft 15 mdl_ctGRe @@ -267,7 +268,7 @@ smeft 16 mdl_cuWRe smeft 17 mdl_ctWRe smeft 18 mdl_cuBRe smeft 19 mdl_ctBRe -smeft 1 mdl_cG +smeft 2 mdl_cW smeft 20 mdl_cdGRe smeft 21 mdl_cbGRe smeft 22 mdl_cdWRe @@ -278,7 +279,7 @@ smeft 26 mdl_cHj1 smeft 27 mdl_cHQ1 smeft 28 mdl_cHj3 smeft 29 mdl_cHQ3 -smeft 2 mdl_cW +smeft 3 mdl_cH smeft 30 mdl_cHu smeft 31 mdl_cHt smeft 32 mdl_cHd @@ -289,7 +290,7 @@ smeft 36 mdl_cjj11 smeft 37 mdl_cjj18 smeft 38 mdl_cjj31 smeft 39 mdl_cjj38 -smeft 3 mdl_cH +smeft 4 mdl_cHbox smeft 40 mdl_cQj11 smeft 41 mdl_cQj18 smeft 42 mdl_cQj31 @@ -300,7 +301,7 @@ smeft 46 mdl_cuu1 smeft 47 mdl_cuu8 smeft 48 mdl_ctt smeft 49 mdl_ctu1 -smeft 4 mdl_cHbox +smeft 5 mdl_cHDD smeft 50 mdl_ctu8 smeft 51 mdl_cdd1 smeft 52 mdl_cdd8 @@ -311,7 +312,7 @@ smeft 56 mdl_cud1 smeft 57 mdl_ctb1 smeft 58 mdl_ctd1 smeft 59 mdl_cbu1 -smeft 5 mdl_cHDD +smeft 6 mdl_cHG smeft 60 mdl_cud8 smeft 61 mdl_ctb8 smeft 62 mdl_ctd8 @@ -322,7 +323,7 @@ smeft 66 mdl_cju1 smeft 67 mdl_cQu1 smeft 68 mdl_cju8 smeft 69 mdl_cQu8 -smeft 6 mdl_cHG +smeft 7 mdl_cHW smeft 70 mdl_ctj1 smeft 71 mdl_ctj8 smeft 72 mdl_cQt1 @@ -333,7 +334,7 @@ smeft 76 mdl_cQd1 smeft 77 mdl_cQd8 smeft 78 mdl_cbj1 smeft 79 mdl_cbj8 -smeft 7 mdl_cHW +smeft 8 mdl_cHB smeft 80 mdl_cQb1 smeft 81 mdl_cQb8 smeft 82 mdl_cjQtu1Re @@ 
-344,7 +345,7 @@ smeft 86 mdl_cjujd1Re smeft 87 mdl_cjujd8Re smeft 88 mdl_cjujd11Re smeft 89 mdl_cjujd81Re -smeft 8 mdl_cHB +smeft 9 mdl_cHWB smeft 90 mdl_cQtjd1Re smeft 91 mdl_cQtjd8Re smeft 92 mdl_cjuQb1Re @@ -355,7 +356,7 @@ smeft 96 mdl_cjtQd1Re smeft 97 mdl_cjtQd8Re smeft 98 mdl_cQtQb1Re smeft 99 mdl_cQtQb8Re -smeft 9 mdl_cHWB +smeftcpv 1 mdl_cGtil smeftcpv 10 mdl_ctWIm smeftcpv 11 mdl_cuBIm smeftcpv 12 mdl_ctBIm @@ -366,7 +367,7 @@ smeftcpv 16 mdl_cbWIm smeftcpv 17 mdl_cdBIm smeftcpv 18 mdl_cbBIm smeftcpv 19 mdl_cuHIm -smeftcpv 1 mdl_cGtil +smeftcpv 2 mdl_cWtil smeftcpv 20 mdl_ctHIm smeftcpv 21 mdl_cdHIm smeftcpv 22 mdl_cbHIm @@ -377,7 +378,7 @@ smeftcpv 26 mdl_cutbd8Im smeftcpv 27 mdl_cjQtu1Im smeftcpv 28 mdl_cjQtu8Im smeftcpv 29 mdl_cjQbd1Im -smeftcpv 2 mdl_cWtil +smeftcpv 3 mdl_cHGtil smeftcpv 30 mdl_cjQbd8Im smeftcpv 31 mdl_cjujd1Im smeftcpv 32 mdl_cjujd8Im @@ -388,7 +389,7 @@ smeftcpv 36 mdl_cQtjd8Im smeftcpv 37 mdl_cjuQb1Im smeftcpv 38 mdl_cjuQb8Im smeftcpv 39 mdl_cQujb1Im -smeftcpv 3 mdl_cHGtil +smeftcpv 4 mdl_cHWtil smeftcpv 40 mdl_cQujb8Im smeftcpv 41 mdl_cjtQd1Im smeftcpv 42 mdl_cjtQd8Im @@ -399,12 +400,11 @@ smeftcpv 46 mdl_ceWIm smeftcpv 47 mdl_ceBIm smeftcpv 48 mdl_cledjIm smeftcpv 49 mdl_clebQIm -smeftcpv 4 mdl_cHWtil +smeftcpv 5 mdl_cHBtil smeftcpv 50 mdl_cleju1Im smeftcpv 51 mdl_cleju3Im smeftcpv 52 mdl_cleQt1Im smeftcpv 53 mdl_cleQt3Im -smeftcpv 5 mdl_cHBtil smeftcpv 6 mdl_cHWBtil smeftcpv 7 mdl_cuGIm smeftcpv 8 mdl_ctGIm @@ -414,10 +414,10 @@ sminputs 1 mdl_MW sminputs 2 mdl_Gf sminputs 3 aS switches 1 mdl_linearPropCorrections +yukawa 1 mdl_ymdo yukawa 11 mdl_yme yukawa 13 mdl_ymm yukawa 15 mdl_ymtau -yukawa 1 mdl_ymdo yukawa 2 mdl_ymup yukawa 3 mdl_yms yukawa 4 mdl_ymc diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc index dab3aac603..e7bc7ae438 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc @@ -2,17 +2,19 @@ MDL_WW = 2.085000D+00 MDL_WH = 4.070000D-03 MDL_WT = 1.330000D+00 + MDL_MD = 4.670000D-03 MDL_ME = 5.110000D-04 MDL_MMU = 1.056600D-01 MDL_MTA = 1.777000D+00 - MDL_MD = 4.670000D-03 + MDL_MU = 2.160000D-03 MDL_MZ = 9.118760D+01 MDL_MH = 1.250900D+02 - MDL_MU = 2.160000D-03 MDL_MS = 9.300000D-02 MDL_MC = 1.270000D+00 MDL_MB = 4.180000D+00 MDL_MT = 1.727600D+02 + MDL_CG = 
0.000000D+00 + MDL_CUHRE = 0.000000D+00 MDL_CEHRE = 0.000000D+00 MDL_CEWRE = 0.000000D+00 MDL_CEBRE = 0.000000D+00 @@ -23,7 +25,7 @@ MDL_CLL1 = 0.000000D+00 MDL_CLJ1 = 0.000000D+00 MDL_CLJ3 = 0.000000D+00 - MDL_CUHRE = 0.000000D+00 + MDL_CTHRE = 0.000000D+00 MDL_CQL1 = 0.000000D+00 MDL_CQL3 = 0.000000D+00 MDL_CEE = 0.000000D+00 @@ -34,7 +36,7 @@ MDL_CJE = 0.000000D+00 MDL_CQE = 0.000000D+00 MDL_CLU = 0.000000D+00 - MDL_CTHRE = 0.000000D+00 + MDL_CDHRE = 0.000000D+00 MDL_CTL = 0.000000D+00 MDL_CLD = 0.000000D+00 MDL_CBL = 0.000000D+00 @@ -45,7 +47,6 @@ MDL_CLEQT1RE = 0.000000D+00 MDL_CLEJU3RE = 0.000000D+00 MDL_CLEQT3RE = 0.000000D+00 - MDL_CDHRE = 0.000000D+00 MDL_CBHRE = 0.000000D+00 MDL_CUGRE = 0.000000D+00 MDL_CTGRE = 0.000000D+00 @@ -53,7 +54,7 @@ MDL_CTWRE = 0.000000D+00 MDL_CUBRE = 0.000000D+00 MDL_CTBRE = 0.000000D+00 - MDL_CG = 0.000000D+00 + MDL_CW = 0.000000D+00 MDL_CDGRE = 0.000000D+00 MDL_CBGRE = 0.000000D+00 MDL_CDWRE = 0.000000D+00 @@ -64,7 +65,7 @@ MDL_CHQ1 = 0.000000D+00 MDL_CHJ3 = 0.000000D+00 MDL_CHQ3 = 0.000000D+00 - MDL_CW = 0.000000D+00 + MDL_CH = 0.000000D+00 MDL_CHU = 0.000000D+00 MDL_CHT = 0.000000D+00 MDL_CHD = 0.000000D+00 @@ -75,7 +76,7 @@ MDL_CJJ18 = 0.000000D+00 MDL_CJJ31 = 0.000000D+00 MDL_CJJ38 = 0.000000D+00 - MDL_CH = 0.000000D+00 + MDL_CHBOX = 0.000000D+00 MDL_CQJ11 = 0.000000D+00 MDL_CQJ18 = 0.000000D+00 MDL_CQJ31 = 0.000000D+00 @@ -86,7 +87,7 @@ MDL_CUU8 = 0.000000D+00 MDL_CTT = 0.000000D+00 MDL_CTU1 = 0.000000D+00 - MDL_CHBOX = 0.000000D+00 + MDL_CHDD = 0.000000D+00 MDL_CTU8 = 0.000000D+00 MDL_CDD1 = 0.000000D+00 MDL_CDD8 = 0.000000D+00 @@ -97,7 +98,7 @@ MDL_CTB1 = 0.000000D+00 MDL_CTD1 = 0.000000D+00 MDL_CBU1 = 0.000000D+00 - MDL_CHDD = 0.000000D+00 + MDL_CHG = 0.000000D+00 MDL_CUD8 = 0.000000D+00 MDL_CTB8 = 0.000000D+00 MDL_CTD8 = 0.000000D+00 @@ -108,7 +109,7 @@ MDL_CQU1 = 0.000000D+00 MDL_CJU8 = 0.000000D+00 MDL_CQU8 = 0.000000D+00 - MDL_CHG = 0.000000D+00 + MDL_CHW = 0.000000D+00 MDL_CTJ1 = 0.000000D+00 MDL_CTJ8 = 0.000000D+00 MDL_CQT1 = 0.000000D+00 @@ -119,7 +120,7 @@ MDL_CQD8 = 0.000000D+00 MDL_CBJ1 = 0.000000D+00 MDL_CBJ8 = 0.000000D+00 - MDL_CHW = 0.000000D+00 + MDL_CHB = 0.000000D+00 MDL_CQB1 = 0.000000D+00 MDL_CQB8 = 0.000000D+00 MDL_CJQTU1RE = 0.000000D+00 @@ -130,7 +131,7 @@ MDL_CJUJD8RE = 0.000000D+00 MDL_CJUJD11RE = 0.000000D+00 MDL_CJUJD81RE = 0.000000D+00 - MDL_CHB = 0.000000D+00 + MDL_CHWB = 0.000000D+00 MDL_CQTJD1RE = 0.000000D+00 MDL_CQTJD8RE = 0.000000D+00 MDL_CJUQB1RE = 0.000000D+00 @@ -141,7 +142,7 @@ MDL_CJTQD8RE = 0.000000D+00 MDL_CQTQB1RE = 0.000000D+00 MDL_CQTQB8RE = 0.000000D+00 - MDL_CHWB = 0.000000D+00 + MDL_CGTIL = 0.000000D+00 MDL_CTWIM = 0.000000D+00 MDL_CUBIM = 0.000000D+00 MDL_CTBIM = 0.000000D+00 @@ -152,7 +153,7 @@ MDL_CDBIM = 0.000000D+00 MDL_CBBIM = 0.000000D+00 MDL_CUHIM = 0.000000D+00 - MDL_CGTIL = 0.000000D+00 + MDL_CWTIL = 0.000000D+00 MDL_CTHIM = 0.000000D+00 MDL_CDHIM = 0.000000D+00 MDL_CBHIM = 0.000000D+00 @@ -163,7 +164,7 @@ MDL_CJQTU1IM = 0.000000D+00 MDL_CJQTU8IM = 0.000000D+00 MDL_CJQBD1IM = 0.000000D+00 - MDL_CWTIL = 0.000000D+00 + MDL_CHGTIL = 0.000000D+00 MDL_CJQBD8IM = 0.000000D+00 MDL_CJUJD1IM = 0.000000D+00 MDL_CJUJD8IM = 0.000000D+00 @@ -174,7 +175,7 @@ MDL_CJUQB1IM = 0.000000D+00 MDL_CJUQB8IM = 0.000000D+00 MDL_CQUJB1IM = 0.000000D+00 - MDL_CHGTIL = 0.000000D+00 + MDL_CHWTIL = 0.000000D+00 MDL_CQUJB8IM = 0.000000D+00 MDL_CJTQD1IM = 0.000000D+00 MDL_CJTQD8IM = 0.000000D+00 @@ -185,12 +186,11 @@ MDL_CEBIM = 0.000000D+00 MDL_CLEDJIM = 0.000000D+00 MDL_CLEBQIM = 0.000000D+00 - MDL_CHWTIL = 
0.000000D+00 + MDL_CHBTIL = 0.000000D+00 MDL_CLEJU1IM = 0.000000D+00 MDL_CLEJU3IM = 0.000000D+00 MDL_CLEQT1IM = 0.000000D+00 MDL_CLEQT3IM = 0.000000D+00 - MDL_CHBTIL = 0.000000D+00 MDL_CHWBTIL = 0.000000D+00 MDL_CUGIM = 0.000000D+00 MDL_CTGIM = 0.000000D+00 @@ -200,10 +200,10 @@ MDL_GF = 1.166379D-05 AS = 1.179000D-01 MDL_LINEARPROPCORRECTIONS = 0.000000D+00 + MDL_YMDO = 4.670000D-03 MDL_YME = 5.110000D-04 MDL_YMM = 1.056600D-01 MDL_YMTAU = 1.777000D+00 - MDL_YMDO = 4.670000D-03 MDL_YMUP = 2.160000D-03 MDL_YMS = 9.300000D-02 MDL_YMC = 1.270000D+00 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }

+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h
index 24800c08c9..50496fa2bf 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h"
+#include "processConfig.h"

 #include

@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;

   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;

 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc
index 84ba0de9b4..7344868d8d 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"

 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE
unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv 
+= cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -413,8 +477,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -429,8 +496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -446,8 +516,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -462,8 +535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -478,8 +554,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -494,8 +573,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0];
@@ -510,8 +592,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 8
       FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[7] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -526,8 +611,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[8] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -542,8 +630,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 10
       FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[9] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -560,8 +651,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[10] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[5] += 1. / 6. * amp_sv[0];
@@ -574,8 +668,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[11] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 6. * amp_sv[0];
       jamp_sv[5] += 1. / 2. * amp_sv[0];
@@ -589,8 +686,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 13
       FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[12] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 6. * amp_sv[0];
       jamp_sv[5] += 1. / 2. * amp_sv[0];
@@ -603,8 +703,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 14
       FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[13] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 2. * amp_sv[0];
       jamp_sv[5] += 1. / 6. * amp_sv[0];
@@ -617,8 +720,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 15
       FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[14] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 2. * amp_sv[0];
       jamp_sv[4] -= 1. / 6. * amp_sv[0];
@@ -631,8 +737,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 16
       FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[15] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[4] -= 1. / 2. * amp_sv[0];
@@ -645,8 +754,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 17
       FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[16] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[1] += 1. / 6. * amp_sv[0];
@@ -659,8 +771,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 18
       VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[17] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -673,8 +788,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 19
       FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[18] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 6. * amp_sv[0];
       jamp_sv[1] += 1. / 2. * amp_sv[0];
@@ -687,8 +805,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 20
       VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[19] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -703,8 +824,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 21
       FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[20] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 6. * amp_sv[0];
       jamp_sv[8] -= 1. / 2. * amp_sv[0];
@@ -717,8 +841,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 22
       FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[21] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 2. * amp_sv[0];
       jamp_sv[8] -= 1. / 6. * amp_sv[0];
@@ -731,8 +858,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 23
       FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[22] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 6. * amp_sv[0];
       jamp_sv[10] += 1. / 2. * amp_sv[0];
@@ -745,8 +875,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 24
       FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[23] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 2. * amp_sv[0];
       jamp_sv[10] += 1. / 6. * amp_sv[0];
@@ -759,8 +892,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 25
       FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[24] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 6. * amp_sv[0];
       jamp_sv[6] += 1. / 2. * amp_sv[0];
@@ -773,8 +909,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 26
       FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[25] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 2. * amp_sv[0];
       jamp_sv[6] += 1. / 6. * amp_sv[0];
@@ -787,8 +926,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 27
       FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[26] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= 1. / 2. * amp_sv[0];
       jamp_sv[10] += 1. / 6. * amp_sv[0];
@@ -801,8 +943,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 28
       VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[27] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -815,8 +960,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 29
       FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[28] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= 1. / 6. * amp_sv[0];
       jamp_sv[10] += 1. / 2. * amp_sv[0];
@@ -829,8 +977,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 30
       VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[29] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -844,8 +995,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 31
       FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[30] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 6. * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
@@ -858,8 +1012,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 32
       FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[31] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 2. * amp_sv[0];
       jamp_sv[7] -= 1. / 6. * amp_sv[0];
@@ -872,8 +1029,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 33
       FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[32] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 2. * amp_sv[0];
       jamp_sv[7] -= 1. / 6. * amp_sv[0];
@@ -886,8 +1046,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 34
       FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[33] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 6. * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
@@ -900,8 +1063,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 35
       FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[34] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
       jamp_sv[6] += 1. / 6. * amp_sv[0];
@@ -914,8 +1080,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 36
       FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[35] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= 1. / 6. * amp_sv[0];
       jamp_sv[6] += 1. / 2. * amp_sv[0];
@@ -928,8 +1097,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 37
       FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[36] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 2. * amp_sv[0];
       jamp_sv[3] -= 1. / 6. * amp_sv[0];
@@ -942,8 +1114,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 38
       VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[37] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -956,8 +1131,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 39
       FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[38] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -970,8 +1148,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 40
       VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[39] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -985,8 +1166,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 41
       FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[40] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= 1. / 6. * amp_sv[0];
       jamp_sv[9] += 1. / 2. * amp_sv[0];
@@ -999,8 +1183,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 42
       FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[41] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
       jamp_sv[9] += 1. / 6. * amp_sv[0];
@@ -1013,8 +1200,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 43
       FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[42] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 6. * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
@@ -1027,8 +1217,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 44
       FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[43] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 2. * amp_sv[0];
       jamp_sv[7] -= 1. / 6. * amp_sv[0];
@@ -1041,8 +1234,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 45
       FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[44] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 6. * amp_sv[0];
       jamp_sv[11] -= 1. / 2. * amp_sv[0];
@@ -1055,8 +1251,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 46
       FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[45] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 2. * amp_sv[0];
       jamp_sv[11] -= 1. / 6. * amp_sv[0];
@@ -1069,8 +1268,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 47
       FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[46] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += 1. / 2. * amp_sv[0];
       jamp_sv[11] -= 1. / 6. * amp_sv[0];
@@ -1083,8 +1285,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 48
       VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 48 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[47] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1097,8 +1302,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 49
       FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[48] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += 1. / 6. * amp_sv[0];
       jamp_sv[11] -= 1. / 2. * amp_sv[0];
@@ -1111,8 +1319,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 50
       VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[49] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1125,8 +1336,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 51
       FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[50] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= 1. / 2. * amp_sv[0];
       jamp_sv[9] += 1. / 6. * amp_sv[0];
@@ -1139,8 +1353,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 52
       FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[51] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1153,8 +1370,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 53
       FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[52] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= 1. / 6. * amp_sv[0];
       jamp_sv[9] += 1. / 2. * amp_sv[0];
@@ -1167,8 +1387,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 54
       FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[53] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1181,8 +1404,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 55
       FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[54] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[2] += 1. / 6. * amp_sv[0];
@@ -1195,8 +1421,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 56
       FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[55] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1209,8 +1438,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 57
       FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[56] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 6. * amp_sv[0];
       jamp_sv[2] += 1. / 2. * amp_sv[0];
@@ -1223,8 +1455,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 58
       FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[57] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1237,8 +1472,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 59
       FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[58] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += 1. / 2. * amp_sv[0];
       jamp_sv[11] -= 1. / 6. * amp_sv[0];
@@ -1251,8 +1489,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 60
       FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[59] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1265,8 +1506,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 61
       FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[60] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += 1. / 6. * amp_sv[0];
       jamp_sv[11] -= 1. / 2. * amp_sv[0];
@@ -1279,8 +1523,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 62
       FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[61] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1293,8 +1540,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 63
       FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[62] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 2. * amp_sv[0];
       jamp_sv[3] -= 1. / 6. * amp_sv[0];
@@ -1307,8 +1557,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 64
       FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[63] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1321,8 +1574,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 65
       FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[64] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -1335,8 +1591,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 66
       FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[65] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1371,8 +1630,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 68
       VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[67] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 2. * amp_sv[0];
       jamp_sv[5] -= 1. / 2. * amp_sv[0];
@@ -1387,8 +1649,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 69
       VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[68] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 2. * amp_sv[0];
       jamp_sv[5] -= 1. / 2. * amp_sv[0];
@@ -1425,8 +1690,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 71
       VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[70] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[4] += 1. / 2. * amp_sv[0];
@@ -1441,8 +1709,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 72
       VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[71] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
       jamp_sv[4] += 1. / 2. * amp_sv[0];
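The mechanical change repeated in every hunk above is the important one: the old code kept a single running numerator, gated on the event's `channelId == N`, while the new code records |amp|^2 for every diagram whenever channel weights are requested. A minimal scalar sketch of the new bookkeeping (scalar stand-ins for the generated SIMD types; `ndiagrams` and the names here are illustrative, not generated code):

#include <array>
#include <complex>

constexpr int ndiagrams = 72; // matches processConfig::ndiagrams for this process

struct ChannelWeights
{
  std::array<double, ndiagrams> numerators{}; // one |amp|^2 accumulator per diagram
  double denominator = 0.;                    // shared running sum over all diagrams
};

// Called once per diagram, mirroring 'numerators_sv[idiag] += cxabs2( amp_sv[0] )'.
inline void accumulate( ChannelWeights& w, int idiag, std::complex<double> amp, bool storeChannelWeights )
{
  if( !storeChannelWeights ) return;  // old code instead tested 'channelId == idiag+1'
  const double a2 = std::norm( amp ); // |amp|^2, the scalar analogue of cxabs2()
  w.numerators[idiag] += a2;          // 0-based slot (diagram 8 -> numerators[7])
  w.denominator += a2;
}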
@@ -1810,9 +2081,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1888,8 +2158,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1951,25 +2220,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,          // if true, compute final multichannel weights
+    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-    const fptype globaldenom ) /* clang-format on */
+    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
      fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
      fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
      for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
      {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
        totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
      }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
    }
 #endif
    return;
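With per-diagram numerators available, the multichannel reweighting applied by the rewritten normalise_output reduces, per event, to ME *= N_c / D, where N_c is the numerator of the selected channel summed over good helicities and D is the helicity-summed denominator. A plain-C++ sketch of that reduction, assuming the [ighel][ievt][idiag] layout implied by the pointer arithmetic above (names are illustrative):

#include <cassert>

// Layout assumption (from the patch): ghelNum[( ievt + ighel * nevt ) * ndiag + idiag].
inline double channelWeight( double* ghelNum, const double* ghelDen,
                             int nGoodHel, int nevt, int ndiag,
                             int ievt, unsigned int channelId )
{
  assert( channelId >= 1 );
  double* totNum = ghelNum + ievt * ndiag; // reuse the "helicity #0" slice as the total
  double totDen = ghelDen[ievt];
  for( int ighel = 1; ighel < nGoodHel; ighel++ )
  {
    const double* hNum = ghelNum + ( ievt + ighel * nevt ) * ndiag;
    for( int idiag = 0; idiag < ndiag; ++idiag ) totNum[idiag] += hNum[idiag];
    totDen += ghelDen[ievt + ighel * nevt];
  }
  return totNum[channelId - 1] / totDen; // multiplied into |M|^2 when mulChannelWeight is set
}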
@@ -2014,16 +2293,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -2091,6 +2398,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -2100,6 +2408,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the channel weight into the ME output
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
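select_col_and_diag draws the channel by inverting the discrete CDF built from the per-diagram numerators, skipping diagrams that have no SDE configuration (channel2iconfig == -1). A standalone host-side sketch of that sampling step, with hypothetical container types:

#include <cstddef>
#include <vector>

// Inverse-CDF channel draw, mirroring the kernel logic above.
inline unsigned int sampleChannel( const std::vector<double>& numerators, // one weight per diagram
                                   const std::vector<int>& channel2iconfig,
                                   double rnd ) // uniform random number in [0,1)
{
  double norm = 0.;
  for( std::size_t i = 0; i < numerators.size(); ++i )
    if( channel2iconfig[i] != -1 ) norm += numerators[i];
  double cumsum = 0.;
  unsigned int channelId = static_cast<unsigned int>( numerators.size() ); // fallback, as in the kernel
  for( std::size_t i = 0; i < numerators.size(); ++i )
  {
    if( channel2iconfig[i] == -1 ) continue; // diagram cannot be sampled
    cumsum += numerators[i];
    if( rnd < cumsum / norm )
    {
      channelId = i + 1; // 1-based channelId convention
      break;
    }
  }
  return channelId;
}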
@@ -2111,8 +2421,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the channel weight into the ME output
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -2140,7 +2452,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -2156,7 +2467,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -2170,11 +2481,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -2186,6 +2500,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
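The gpuMemset and ieventAccessRecord changes above all follow from one layout decision: the numerator super-buffer grows by a factor of ndiagrams and is addressed as [ighel][ievt][idiag]. A sketch of the index arithmetic with the strides spelled out (helper names are illustrative):

#include <cstddef>

// Buffer size and flat index for the enlarged numerator super-buffer.
constexpr std::size_t numeratorsBufferSize( std::size_t nGoodHel, std::size_t nevt, std::size_t ndiag )
{
  return nGoodHel * ndiag * nevt; // was nGoodHel * nevt before the patch
}
constexpr std::size_t numeratorIndex( std::size_t ighel, std::size_t ievt, std::size_t idiag,
                                      std::size_t nevt, std::size_t ndiag )
{
  return ( ighel * nevt + ievt ) * ndiag + idiag; // [ighel][ievt][idiag]
}
static_assert( numeratorIndex( 0, 0, 0, 16, 72 ) == 0, "origin" );
static_assert( numeratorIndex( 1, 0, 0, 16, 72 ) == 16 * 72, "per-helicity stride = nevt * ndiagrams" );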
@@ -2194,9 +2509,10 @@ namespace mg5amcCpu
      const int ihel = cGoodHel[ighel];
      fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -2207,9 +2523,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -2247,40 +2569,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
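For context, the launch pattern used above (one stream per good helicity, followed by synchronizing reduction/selection kernels) looks roughly as follows in plain CUDA; the patch itself goes through the gpuStream_t / gpuLaunchKernelStream wrappers, so this is only an illustration with hypothetical names:

#include <cuda_runtime.h>
#include <vector>

__global__ void workForHelicity( int ihel, float* out )
{
  // Each helicity writes a disjoint slice, so streams may overlap freely.
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  out[ihel * gridDim.x * blockDim.x + tid] = (float)ihel;
}

void launchPerHelicity( int nGoodHel, int nblocks, int nthreads, float* out )
{
  std::vector<cudaStream_t> streams( nGoodHel );
  for( auto& s : streams ) cudaStreamCreate( &s );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    workForHelicity<<<nblocks, nthreads, 0, streams[ighel]>>>( ighel, out );
  // Later kernels (helicity/color selection, normalise_output) need all streams done.
  cudaDeviceSynchronize();
  for( auto& s : streams ) cudaStreamDestroy( s );
}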
@@ -2295,7 +2583,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -2343,82 +2632,97 @@ namespace mg5amcCpu
       }
 #endif
     }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-    // Event-by-event random choice of color #402
-    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-    {
-      if( channelId > mgOnGpu::nchannels )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-      }
-      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-      const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-      if( iconfig <= 0 )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-        assert( iconfig > 0 ); // SANITY CHECK #917
-      }
-      else if( iconfig > (int)mgOnGpu::nconfigSDE )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-        assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-      }
-      fptype_sv targetamp[ncolor] = { 0 };
-      // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-      for( int icolC = 0; icolC < ncolor; icolC++ )
-      {
-        if( icolC == 0 )
-          targetamp[icolC] = fptype_sv{ 0 };
-        else
-          targetamp[icolC] = targetamp[icolC - 1];
-        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-      }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype_sv targetamp2[ncolor] = { 0 };
-      for( int icolC = 0; icolC < ncolor; icolC++ )
+    const int vecsize = 2 * neppV;
++#else
+    const int vecsize = neppV;
+#endif
+    unsigned int channelIdVec[vecsize];
+    if( allChannelIds != nullptr )
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
      {
-        if( icolC == 0 )
-          targetamp2[icolC] = fptype_sv{ 0 };
-        else
-          targetamp2[icolC] = targetamp2[icolC - 1];
-        // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+        const int ievt = ievt00 + ieppV;
+        channelIdVec[ieppV] = allChannelIds[ievt];
      }
-#endif
-      for( int ieppV = 0; ieppV < neppV; ++ieppV )
+    }
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
      {
        const int ievt = ievt00 + ieppV;
-        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+        fptype numerator_sum = 0., normalization = 0.;
+        for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
        {
-#if defined MGONGPU_CPPSIMD
-          // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-          volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-          const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-          if( okcol )
+          if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+          normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                         ichan * neppV + ieppV % neppV];
+        }
+        channelIdVec[ieppV] = mgOnGpu::nchannels;
+        for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+        {
+          if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+          numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                         ichan * neppV + ieppV % neppV];
+          if( allrnddiagram[ievt] < numerator_sum / normalization )
          {
-            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+            channelIdVec[ieppV] = ichan + 1;
            break;
          }
        }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        const int ievt2 = ievt00 + ieppV + neppV;
-        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+        allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+      }
+    }
+
+    // Event-by-event random choice of color #402
+    if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+      {
+        unsigned int channelId = channelIdVec[ieppV];
+        if( channelId > mgOnGpu::nchannels )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+        }
+        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+        if( iconfig <= 0 )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+          assert( iconfig > 0 ); // SANITY CHECK #917
+        }
+        else if( iconfig > (int)mgOnGpu::nconfigSDE )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+        }
+        fptype targetamp[ncolor] = { 0 };
+        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
        for( int icolC = 0; icolC < ncolor; icolC++ )
        {
-          if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+          if( icolC == 0 )
+            targetamp[icolC] = 0;
+          else
+            targetamp[icolC] = targetamp[icolC - 1];
+          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+            jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+        }
+        const int ievt = ievt00 + ieppV;
+        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+        for( int icolC = 0; icolC < ncolor; icolC++ )
+        {
+          if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
          {
-            allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
            break;
          }
        }
-#endif
      }
    }
    else
@@ -2443,13 +2747,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -2457,13 +2755,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
      {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
        fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
        fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
      }
 #endif
      //for( int ieppV = 0; ieppV < neppV; ieppV++ )
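On the C++ side the numerators live in SIMD pages of neppV events, so the flat index used above interleaves diagrams and lanes within a page. A sketch of that index math (helper name is illustrative):

#include <cstddef>

// SoA page layout assumed by the C++ path: [ipagV][idiag][ieppV],
// i.e. index = ipagV * ndiag * neppV + idiag * neppV + ieppV.
constexpr std::size_t cpuNumeratorIndex( std::size_t ievt, std::size_t idiag,
                                         std::size_t neppV, std::size_t ndiag )
{
  const std::size_t ipagV = ievt / neppV; // SIMD page of this event
  const std::size_t ieppV = ievt % neppV; // lane within the page
  return ipagV * ndiag * neppV + idiag * neppV + ieppV;
}
// Matches 'ievt / neppV * neppV * ndiagrams + ichan * neppV + ieppV % neppV' in the patch.
static_assert( cpuNumeratorIndex( 5, 3, 4, 72 ) == 1 * 72 * 4 + 3 * 4 + 1, "page layout" );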
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h
index f8f13801dd..7e444f2546 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the channel weight into the ME output
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
     int* allselcol, // output: helicity selection[nevt]
     fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the channel weight into the ME output
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
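processConfig.h (added below) introduces a per-process, compile-time configuration namespace; buffer sizes that previously scaled only with nevt can now scale with the diagram count at compile time. A hypothetical usage sketch:

#include <cstddef>
#include "processConfig.h"

// Illustrative only: size a per-event numerator block at compile time.
constexpr std::size_t numeratorsPerEvent = processConfig::ndiagrams; // 72 for gg -> ttxttx
// e.g. DeviceBufferSimple( nGoodHel * processConfig::ndiagrams * nevt ), as in the patch.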
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/processConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/processConfig.h
new file mode 100644
index 0000000000..b9f07de180
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H
+#define MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 72;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H
\ No newline at end of file
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
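For orientation, a hedged caller-side sketch of the umami C interface added below; the param card path and error handling are illustrative, not taken from the patch:

#include "umami.h"
#include <cstdio>

int main()
{
  UmamiDevice device;
  if( umami_get_meta( UMAMI_META_DEVICE, &device ) != UMAMI_SUCCESS ) return 1;
  std::printf( "backend: %s\n", device == UMAMI_DEVICE_CPU ? "CPU" : "GPU" );
  UmamiHandle handle = nullptr; // opaque handle owned by the library
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; // hypothetical path
  return 0;
}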
MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to twice the SIMD page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Retrieves global metadata of the matrix element implementation, such as the + * target device or the number of particles, diagrams and helicity combinations. + * + * @param meta_key + * key identifying the requested metadata entry + * @param result + * pointer to the memory receiving the result; its type depends on the key + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @param param_card_path + * path to the parameter file + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees a matrix element instance + * + * @param handle + * handle of a matrix element instance + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 065f7b4329..239e177d5b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,27 +49,27 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox".
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t -INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models  ---2025-10-22 11:49:03-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz +INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/models  +--2025-12-11 12:38:40-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz Resolving feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)... 130.104.48.109 Connecting to feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)|130.104.48.109|:80... connected. HTTP request sent, awaiting response... 200 Ok Length: 80562 (79K) [application/x-tar] Saving to: ‘tmp.tgz’ - 0K .......... .......... .......... .......... .......... 63% 830K 0s - 50K .......... .......... ........ 100% 124M=0.06s + 0K .......... .......... .......... .......... .......... 63% 880K 0s + 50K .......... .......... ........ 100% 1.37M=0.08s -2025-10-22 11:49:03 (1.27 MB/s) - ‘tmp.tgz’ saved [80562/80562] +2025-12-11 12:38:40 (1018 KB/s) - ‘tmp.tgz’ saved [80562/80562] SMEFTsim_topU3l_MwScheme_UFO/ SMEFTsim_topU3l_MwScheme_UFO/__init__.py @@ -89,7 +90,7 @@ SMEFTsim_topU3l_MwScheme_UFO/lorentz.py SMEFTsim_topU3l_MwScheme_UFO/vertices.py SMEFTsim_topU3l_MwScheme_UFO/restrict_SMlimit_massless.dat fail to load model but auto_convert_model is on True. 
Trying to convert the model -convert model /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO +convert model /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO retry the load of the model import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles @@ -107,7 +108,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.07803130149841309  +DEBUG: model prefixing takes 0.03392601013183594  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -125,33 +126,33 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.695 s +1 processes with 72 diagrams generated in 1.441 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for 
process: g g > t t~ t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. 
-Generated helas calls for 1 subprocesses (72 diagrams) in 0.127 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +Generated helas calls for 1 subprocesses (72 diagrams) in 0.068 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.148 s VVV5 VVV5 FFV1 @@ -161,17 +162,17 @@ ALOHA: aloha creates 5 routines in 0.281 s VVVV1 VVVV9 VVVV10 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m4.417s -user 0m3.862s -sys 0m0.114s -Code generation completed in 5 seconds +real 0m4.179s +user 0m2.251s +sys 0m0.181s +Code generation completed in 4 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const 
unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h index 24800c08c9..50496fa2bf 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" +#include "processConfig.h" + #include + @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc index 8c3316992a..d7b2ade5f4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h"
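For reference, the MemoryBuffers.h hunk above grows the numerator buffer from one slot per event (sizePerEventNumerators = 1) to one slot per diagram per event (processConfig::ndiagrams). A minimal sketch of the resulting flat row-major indexing, mirroring the division performed in copy_outputs in umami.cc above; the helper name diagramWeight is illustrative and not part of the patch:

// Sketch only: per-diagram single-diagram-enhancement weight of one event,
// assuming the row-major layout numerators[i_event * ndiagrams + i_diag]
// used by copy_outputs in umami.cc above.
#include <cstddef>

inline double diagramWeight( const double* numerators,   // [count * ndiagrams]
                             const double* denominators, // [count]
                             std::size_t i_event,
                             std::size_t i_diag,
                             std::size_t ndiagrams )
{
  return numerators[i_event * ndiagrams + i_diag] / denominators[i_event];
}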
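For reference, a minimal host-side sketch of calling the UMAMI C API declared in umami.h above; the function name run_umami_example, the stride choice and the buffer sizes are illustrative assumptions, while the umami_* calls, keys and signatures come from the header:

// Sketch only: evaluate |M|^2 for a batch of events through the UMAMI C API.
// Momenta are assumed to be laid out as momenta[stride * ( npar * i_mom + i_part ) + i_event]
// (the layout consumed by transpose_momenta in umami.cc), here with stride == count.
#include "umami.h"
#include <cstddef>
#include <vector>

int run_umami_example( const char* param_card_path, std::size_t count, const double* momenta )
{
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, param_card_path ) != UMAMI_SUCCESS ) return 1;

  std::vector<double> me2( count ); // one |M|^2 value per event
  UmamiInputKey input_keys[1] = { UMAMI_IN_MOMENTA };
  const void* inputs[1] = { momenta };
  UmamiOutputKey output_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outputs[1] = { me2.data() };

  UmamiStatus status = umami_matrix_element( handle, count, /*stride*/ count, /*offset*/ 0,
                                             1, input_keys, inputs, 1, output_keys, outputs );
  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}

Omitted optional inputs fall back to the defaults hardcoded in umami.cc (random numbers default to 0.5 and the strong coupling to a fixed value when UMAMI_IN_ALPHA_S is absent).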
@@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( 
denominators ); #endif @@ -1758,9 +1819,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1836,8 +1896,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1899,25 +1958,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + {
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1962,16 +2031,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -2039,6 +2136,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -2048,6 +2146,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -2059,8 +2159,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -2088,7 +2190,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -2104,7 +2205,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -2118,11 +2219,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -2134,6 +2238,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -2142,9
+2247,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -2155,9 +2261,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2195,40 +2307,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -2243,7 +2321,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -2291,82 +2370,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -2391,13 +2485,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -2405,13 +2493,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h index f8f13801dd..7e444f2546 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h +++ 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/processConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/processConfig.h new file mode 100644 index 0000000000..b9f07de180 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
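Context for the buffer resizing above: the per-diagram numerators introduced by this patch are stored event-major on the GPU and SIMD-page-major in the C++ path. Below is a minimal sketch, not part of the patch, of the two indexing conventions used throughout this changeset; the helper names are hypothetical, while fptype, neppV and processConfig::ndiagrams are as defined in the cudacpp headers. The GPU form matches select_col_and_diag and copy_outputs, the CPU form matches the allNumerators indexing in the rewritten C++ sigmaKin.

// Sketch only (hypothetical helpers): per-diagram numerator lookup for one event.
inline fptype& numeratorGpu( fptype* allNumerators, int ievt, int idiag )
{
  // GPU: one scalar slot per (event, diagram), event-major
  return allNumerators[ievt * processConfig::ndiagrams + idiag];
}
inline fptype& numeratorCpu( fptype* allNumerators, int ievt, int idiag, int neppV )
{
  const int ipag = ievt / neppV;  // SIMD page index
  const int ilane = ievt % neppV; // lane within the page
  // CPU: diagrams interleaved per SIMD page of neppV events
  return allNumerators[ipag * neppV * processConfig::ndiagrams + idiag * neppV + ilane];
}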
+ + +#ifndef MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H +#define MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 72; + +} + +#endif // MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.cc @@ -0,0 +1,534 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cmath> +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* 
numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
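+ // Each entry of `inputs` is an untyped void*: the UmamiInputKey selects the
+ // concrete element type (double for the momenta, alpha_s and random-number
+ // inputs, int for the index inputs), which the cases below restore via static_cast.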
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to two SIMD pages: in mixed precision, the C++ sigmaKin processes two neppV pages at a time (see #924) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata about the matrix element implementation, + * independently of any instance (e.g. the device it runs on). + * + * @param meta_key + * key identifying the metadata entry to query + * @param result + * pointer to caller-allocated memory that receives the value; its + * concrete type depends on the meta key (e.g. int for the counts). + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @param param_card_path + * path to the parameter file + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 01968dc817..cab1a5820a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.071 s +1 processes with 6 diagrams generated in 0.054 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -575,52 +576,54 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s -Wrote files for 16 helas calls in 0.065 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (6 diagrams) in 0.005 s +Wrote files for 16 helas calls in 2.514 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.125 s +ALOHA: aloha creates 3 routines in 0.105 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines 
-ALOHA: aloha creates 6 routines in 0.118 s +ALOHA: aloha creates 6 routines in 0.089 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m2.714s -user 0m2.329s -sys 0m0.381s -Code generation completed in 3 seconds +real 0m11.941s +user 0m1.873s +sys 0m0.654s +Code generation completed in 12 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -641,9 +644,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -670,9 +673,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat index 9cfb7ac1a2..0c5a1bdd83 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat @@ -232,9 +232,9 @@ mse2 3 3 mdl_RmE23x3 msl2 1 1 mdl_RmL21x1 msl2 3 3 mdl_RmL23x3 msoft 1 mdl_RMx1 +msoft 2 mdl_RMx2 msoft 21 mdl_mHd2 msoft 22 mdl_mHu2 -msoft 2 mdl_RMx2 msoft 3 mdl_RMx3 msq2 1 1 mdl_RmQ21x1 msq2 3 3 mdl_RmQ23x3 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc index 6acb037f00..a3d72e8ed8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc @@ -81,9 +81,9 @@ MDL_RML21X1 = 3.815567D+04 MDL_RML23X3 = 3.782868D+04 MDL_RMX1 = 1.013965D+02 + MDL_RMX2 = 1.915042D+02 MDL_MHD2 = 3.233749D+04 MDL_MHU2 = -1.288001D+05 - MDL_RMX2 = 1.915042D+02 MDL_RMX3 = 5.882630D+02 MDL_RMQ21X1 = 2.998367D+05 MDL_RMQ23X3 = 2.487654D+05 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( 
hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h index c5e79dc1b1..76849a871e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_MSSM_SLHA2.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc index 89c03a7876..0b6ddbf4aa 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const 
unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv 
+= cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; @@ -414,8 +478,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; @@ -427,8 +494,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; @@ -440,8 +510,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; @@ -453,8 +526,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; @@ -755,9 +831,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -833,8 +908,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
#else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -896,25 +970,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -959,16 +1043,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all 
denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1036,6 +1148,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1045,6 +1158,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1056,8 +1171,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1085,7 +1202,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: 
buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1101,7 +1217,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1115,11 +1231,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1131,6 +1250,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1139,9 +1259,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1152,9 +1273,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1192,40 +1319,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1240,7 +1333,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1288,82 +1382,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1388,13 +1497,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1402,13 +1505,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h index 0c297072b2..bee85ff0b7 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/processConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/processConfig.h new file mode 100644 index 0000000000..998cb0ade6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/processConfig.h 
@@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 6; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, 
+ fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to twice the SIMD page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_MSSM_SLHA2_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
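+   * For example, a caller built against minor version 0 can still be used with a
+   * library implementing minor version 2 of the same major version.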
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves global metadata about the matrix element code, such as the device
+   * it runs on or the number of particles, diagrams, helicity and color
+   * configurations.
+   *
+   * @param meta_key
+   *    key identifying the requested metadata
+   * @param result
+   *    pointer to caller-allocated memory that receives the value; its type
+   *    depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
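+   *
+   * Illustrative usage sketch (not normative: `momenta` and `m2` stand for
+   * caller-allocated arrays of count*npar*4 and count doubles respectively,
+   * laid out contiguously with stride == count and offset == 0; error
+   * handling is omitted):
+   *
+   *   UmamiInputKey in_keys[1] = { UMAMI_IN_MOMENTA };
+   *   const void* const inputs[1] = { momenta };
+   *   UmamiOutputKey out_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* const outputs[1] = { m2 };
+   *   umami_matrix_element( handle, count, count, 0,
+   *                         1, in_keys, inputs, 1, out_keys, outputs );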
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt
index 0c5c2efcaf..ca077e5af2 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -549,47 +550,47 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.074 s +1 processes with 6 diagrams generated in 0.055 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
-Generated helas calls for 1 subprocesses (6 diagrams) in 0.006 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. +Generated helas calls for 1 subprocesses (6 diagrams) in 0.010 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.126 s +ALOHA: aloha creates 3 routines in 0.102 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.007s -user 0m0.940s -sys 0m0.062s -Code generation completed in 1 seconds +real 0m2.643s +user 0m0.729s +sys 0m0.132s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h
index c5e79dc1b1..76849a871e 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_MSSM_SLHA2.h"
+#include "processConfig.h"
 
 #include
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;
 
   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc
index e4718e0681..3db12d56be 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
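+  // getChannelId: helper that extracts the SCALAR channelId for the current event (CUDA)
+  // or for the current SIMD event page(s) (C++); it factors out the extraction and sanity
+  // checks previously inlined in sigmaKin (see the corresponding removed block further down),
+  // so that the same logic can also be reused in the final ME normalisation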
+ __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -756,9 +817,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; 
// disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -834,8 +894,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -897,25 +956,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -960,16 +1029,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1037,6 +1134,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1046,6 +1144,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1057,8 +1157,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1086,7 +1188,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1102,7 +1203,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1116,11 +1217,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1132,6 +1236,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1140,9 +1245,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = 
ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1153,9 +1259,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1193,40 +1305,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1241,7 +1319,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1289,82 +1368,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1389,13 +1483,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1403,13 +1491,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h index 0c297072b2..bee85ff0b7 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h @@ -163,6 +163,7 @@ namespace 
mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h new file mode 100644 index 0000000000..998cb0ade6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 6; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> // for std::size_t + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators,
+#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<std::size_t*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<std::size_t*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<std::size_t*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to double the SIMD page size (reason unclear) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> // for size_t + +#ifdef __cplusplus extern "C" { #endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Retrieves a piece of global metadata about this matrix element code, such as the + * device it runs on or its particle, diagram and helicity counts. + * + * @param meta_key + * key selecting the piece of metadata to query + * @param result + * pointer to caller-allocated storage receiving the value; its type depends on + * the key (UmamiDevice* for UMAMI_META_DEVICE, size_t* for the counts) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees a matrix element instance + * + * @param handle + * handle of a matrix element instance + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus } #endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 463187a10a..3ece1f2ceb 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.089 s +1 processes with 3 diagrams generated in 0.077 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -575,49 +576,51 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s -Wrote files for 10 helas calls in 0.076 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 1.977 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.123 s +ALOHA: aloha creates 2 routines in 0.081 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.120 s +ALOHA: aloha creates 4 routines in 0.072 s VVV1 FFV1 FFV1 FFV1 -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m3.218s -user 0m2.778s -sys 0m0.430s -Code generation completed in 3 seconds +real 0m11.901s +user 0m1.762s +sys 0m0.693s +Code generation completed in 12 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -638,9 +641,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -667,9 +670,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat index 9cfb7ac1a2..0c5a1bdd83 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat @@ -232,9 +232,9 @@ mse2 3 3 mdl_RmE23x3 msl2 1 1 mdl_RmL21x1 msl2 3 3 mdl_RmL23x3 msoft 1 mdl_RMx1 +msoft 2 mdl_RMx2 msoft 21 mdl_mHd2 msoft 22 mdl_mHu2 -msoft 2 mdl_RMx2 msoft 3 mdl_RMx3 msq2 1 1 mdl_RmQ21x1 msq2 3 3 mdl_RmQ23x3 diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc index 6acb037f00..a3d72e8ed8 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc @@ -81,9 +81,9 @@ MDL_RML21X1 = 3.815567D+04 MDL_RML23X3 = 3.782868D+04 MDL_RMX1 = 1.013965D+02 + MDL_RMX2 = 1.915042D+02 MDL_MHD2 = 3.233749D+04 MDL_MHU2 = -1.288001D+05 - MDL_RMX2 = 1.915042D+02 MDL_RMX3 = 5.882630D+02 MDL_RMQ21X1 = 2.998367D+05 MDL_RMQ23X3 = 2.487654D+05 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define 
gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h index c5e79dc1b1..76849a871e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_MSSM_SLHA2.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 98722d3089..14482e097b 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +452,11 @@ // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +469,11 @@
namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -732,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -810,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
#endif /* clang-format on */ @@ -873,25 +941,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -936,16 +1014,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1013,6 +1119,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1022,6 +1129,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1033,8 +1142,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1062,7 +1173,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1078,7 +1188,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1092,11 +1202,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1108,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1116,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1129,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1169,40 +1290,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1217,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1265,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1365,13 +1468,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1379,13 +1476,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 256c5780e4..99f978df4c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h new file mode 100644 index 0000000000..04a79dca0d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// 
Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool 
is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s, corresponding to alpha_s = 0.118 + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
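+        // Input layouts assumed by the handlers below (and by copy_inputs/transpose_momenta):
+        //  - UMAMI_IN_MOMENTA: const double*, momenta_in[stride * ( npar * i_mom + i_part ) + i_event]
+        //  - UMAMI_IN_ALPHA_S: const double*, one alpha_s per event (g_s = sqrt( 4 * M_PI * alpha_s ))
+        //  - UMAMI_IN_RANDOM_COLOR / _HELICITY / _DIAGRAM: const double*, one uniform random number per event
+        //  - UMAMI_IN_FLAVOR_INDEX / UMAMI_IN_DIAGRAM_INDEX: const int*, accepted but currently unused (see TODOs above)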
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + helicity_random_in, + color_random_in, + diagram_random_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to twice the SIMD page size (in mixed precision, calculate_jamps processes two neppV pages at once, see #924) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s, corresponding to alpha_s = 0.118 + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
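+   * Callers can thus check compatibility at load time: an equal major version
+   * guarantees binary compatibility, while a larger minor version only adds keys.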
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata about the matrix element implementation, such as the + * device it runs on or the number of particles, diagrams and helicities. + * + * @param meta_key + * key identifying the requested metadata + * @param result + * pointer to caller-allocated memory receiving the value; its type depends on + * the key (UmamiDevice for UMAMI_META_DEVICE, int for the counts) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @param param_card_path + * path to the parameter file + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
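+   *
+   * A minimal single-event call requesting only |M|^2 might look as follows
+   * (illustrative sketch; assumes a handle from umami_initialize and npar = 4):
+   *
+   *   double momenta[4 * 4]; // count=1, stride=1: all energies, then all px, py, pz
+   *   double m2;
+   *   UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA };
+   *   void const* ins[] = { momenta };
+   *   UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* outs[] = { &m2 };
+   *   umami_matrix_element( handle, 1, 1, 0, 1, in_keys, ins, 1, out_keys, outs );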
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 9c4080b86d..88fc5f557b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F import model MSSM_SLHA2 INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.6192381381988525  +DEBUG: model prefixing takes 0.23334097862243652  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -552,45 +553,45 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.063 s +1 processes with 3 diagrams generated in 0.074 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
-Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. +Generated helas calls for 1 subprocesses (3 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.095 s +ALOHA: aloha creates 2 routines in 0.068 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.922s -user 0m1.810s -sys 0m0.099s -Code generation completed in 2 seconds +real 0m3.608s +user 0m1.150s +sys 0m0.169s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h
index c5e79dc1b1..76849a871e 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_MSSM_SLHA2.h"
+#include "processConfig.h"
 
 #include <sstream>
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;
 
   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
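With sizePerEventNumerators raised from 1 to processConfig::ndiagrams, the numerator buffers now hold one entry per diagram per event. The two backends lay this out differently: the CUDA/HIP kernels take an event-major block via &allNumerators[ievt * processConfig::ndiagrams], while the SIMD C++ path interleaves the diagrams within each neppV event page. A minimal sketch of the C++ indexing follows, with illustrative stand-in values for the generated constants neppV and ndiagrams:

// Sketch only: recomputes the SIMD-page index used for allNumerators in this patch;
// neppV and ndiagrams stand in for the generated compile-time constants.
#include <cstddef>

constexpr std::size_t neppV = 4;     // events per SIMD page (illustrative)
constexpr std::size_t ndiagrams = 3; // diagrams, as in processConfig::ndiagrams

constexpr std::size_t numeratorIndex( std::size_t ievt, std::size_t idiag )
{
  const std::size_t ipage = ievt / neppV; // which SIMD event page
  const std::size_t ieppV = ievt % neppV; // lane within the page
  return ipage * neppV * ndiagrams + idiag * neppV + ieppV;
}

static_assert( numeratorIndex( 0, 0 ) == 0 );
static_assert( numeratorIndex( 5, 2 ) == 21 ); // page 1, diagram 2, lane 1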
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc
index b88ebd5b4a..63f1df1073 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int
+  getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                ,
+                const int ievt00,
+                bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0];    // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g.
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -729,9 +790,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; 
// disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -807,8 +867,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -870,25 +929,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -933,16 +1002,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1010,6 +1107,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1019,6 +1117,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1030,8 +1130,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1059,7 +1161,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1075,7 +1176,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1089,11 +1190,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1105,6 +1209,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1113,9 +1218,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = 
ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1126,9 +1232,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1166,40 +1278,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1214,7 +1292,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1262,82 +1341,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1362,13 +1456,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1376,13 +1464,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h index 256c5780e4..99f978df4c 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h new file mode 100644 index 0000000000..04a79dca0d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
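// Note: processConfig::ndiagrams mirrors CPPProcess::ndiagrams as a standalone
// header-only constant; the per-diagram numerator buffers and the diagram-sampling
// loops in CPPProcess.cc assume that mgOnGpu::nchannels does not exceed it.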
+ + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, 
+#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
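// NB: umami_get_meta above writes through the untyped result pointer: UMAMI_META_DEVICE
// expects it to point to an UmamiDevice, while the particle/diagram/helicity counts
// expect an int; passing a pointer to the wrong type is undefined behaviour.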
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_MSSM_SLHA2_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
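// NB: the scratch buffers above use stream-ordered allocation (gpuMallocAsync on the
// caller-provided gpu_stream), so they are only valid for work enqueued on that stream;
// the matching gpuFreeAsync calls below return them to the pool without forcing a
// device-wide synchronization.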
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
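// NB: copy_outputs above converts the internal conventions to the UMAMI ones: amp2 is
// the per-diagram weight numerators[idiag] / denominator for each event, and the
// returned diagram, color and helicity indices are shifted from 1-based to 0-based.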
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
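
[Reviewer note, not part of the patch: the amp2 copy loop above reads the per-diagram numerators back from the paged "AOSOA" layout (events grouped in pages of MemoryAccessMomentaBase::neppM events, with the diagram index as the middle dimension). A minimal standalone sketch of that indexing, with illustrative names and sizes:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Flat index of (i_event, i_diag) in a [n_pages][n_diagrams][page_size] layout,
    // mirroring the expression used in the copy loop above.
    std::size_t aosoa_index( std::size_t i_event, std::size_t i_diag,
                             std::size_t page_size, std::size_t n_diagrams )
    {
      const std::size_t i_page = i_event / page_size;
      const std::size_t i_vector = i_event % page_size;
      return i_page * page_size * n_diagrams + i_diag * page_size + i_vector;
    }

    int main()
    {
      const std::size_t page_size = 4, n_diagrams = 3, n_events = 8;
      std::vector<double> numerators( n_events * n_diagrams );
      // Fill in AOSOA order, then read back one (event, diagram) pair.
      for( std::size_t e = 0; e < n_events; ++e )
        for( std::size_t d = 0; d < n_diagrams; ++d )
          numerators[aosoa_index( e, d, page_size, n_diagrams )] = 100. * e + d;
      assert( numerators[aosoa_index( 5, 2, page_size, n_diagrams )] == 502. );
      return 0;
    }

Within a page, consecutive events of the same diagram are contiguous, which is what makes the SIMD/coalesced access in sigmaKin possible; the loop above undoes this transposition into the caller's plain strided layout.]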
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves a global metadata item of the matrix element code, e.g. the device
+   * it runs on or the number of particles, diagrams, helicity or color
+   * configurations.
+   *
+   * @param meta_key
+   *    key of the metadata item to retrieve
+   * @param result
+   *    pointer to the memory where the result is written; its type depends on
+   *    the metadata key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @param param_card_path
+   *    path to the parameter file
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return the real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return the imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs of the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
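
[Reviewer note, not part of the patch: a hypothetical end-to-end caller sketch for the API declared above. The function signatures and key enums come from umami.h itself; the param card path, the event counts, and the momenta layout (component-major with the event index running fastest, stride = batch size) are assumptions inferred from the output indexing in the implementation:

    #include "umami.h"
    #include <cstdio>
    #include <vector>

    int main()
    {
      UmamiHandle handle = nullptr;
      // Path to the parameter card is illustrative.
      if( umami_initialize( &handle, "../Cards/param_card.dat" ) != UMAMI_SUCCESS ) return 1;

      const size_t count = 16, stride = 16, offset = 0, npar = 4;
      std::vector<double> momenta( npar * 4 * stride ); // assumed [par][E,px,py,pz][event]
      std::vector<double> alpha_s( stride, 0.118 );
      // ... fill momenta with phase-space points here ...

      UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
      const void* ins[] = { momenta.data(), alpha_s.data() };

      std::vector<double> m2( stride ); // caller allocates all outputs
      UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
      void* outs[] = { m2.data() };

      const UmamiStatus st = umami_matrix_element( handle, count, stride, offset,
                                                   2, in_keys, ins, 1, out_keys, outs );
      if( st == UMAMI_SUCCESS )
        for( size_t i = 0; i < count; ++i ) std::printf( "me[%zu] = %g\n", i, m2[i] );
      umami_free( handle );
      return st == UMAMI_SUCCESS ? 0 : 1;
    }

Per the implementation above, omitted random-number inputs fall back to 0.5 and a missing UMAMI_IN_ALPHA_S falls back to a fixed g_s, so the minimal required input is UMAMI_IN_MOMENTA; all other inputs and outputs are optional.]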